Atomstrom/atomstrom.py

#!/usr/bin/env python
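"""Feed-to-mail aggregator: fetches Atom/RSS feeds into a database and mails
new entries, either individually or as a daily digest.

Illustrative invocation (the options are defined in the main block at the
bottom of this file):

    ./atomstrom.py --fetch --single --daily
"""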
from sqlalchemy import create_engine, Table, Column, Integer, Text, Boolean, DateTime, MetaData, ForeignKey, desc
from sqlalchemy.orm import sessionmaker, relation, backref
from sqlalchemy.ext.declarative import declarative_base
from datetime import datetime, timedelta
from time import mktime
import feedparser
import re
import sys
import urllib
import hn
import html2text
import ConfigParser
import pprint
import smtplib
from email.mime.text import MIMEText
from optparse import OptionParser
Base = declarative_base()

class Feed(Base):
    """A feed subscription and its delivery options."""
    __tablename__ = 'feed'

    id = Column(Integer, primary_key=True)
    url = Column(Text)
    frequency = Column(Integer)  # minimum minutes between fetches
    daily = Column(Boolean)  # include in the daily digest instead of single mails
    readability = Column(Boolean)  # fetch a readability version of each entry
    fullpage = Column(Boolean)  # fetch the full linked page for each entry
    html2textsummary = Column(Boolean)  # convert the summary from HTML to plain text
    enabled = Column(Boolean)

    def __init__(self, url, daily, readability, fullpage, enabled, html2textsummary):
        self.url = url
        self.daily = daily
        self.readability = readability
        self.fullpage = fullpage
        self.html2textsummary = html2textsummary
        self.enabled = enabled

    def __repr__(self):
        return "<Feed('%s','%s','%s')>" % (self.url, self.daily, self.readability)

class Feedinfo(Base):
    """Metadata about a feed, updated on every fetch."""
    __tablename__ = 'feedinfo'

    id = Column(Integer, primary_key=True)
    feed_id = Column(Integer, ForeignKey('feed.id'))
    feed = relation("Feed", backref=backref('feedinfo', uselist=False))
    title = Column(Text)
    link = Column(Text)
    subtitle = Column(Text)
    author = Column(Text)
    publisher = Column(Text)
    status = Column(Integer)
    version = Column(Text)
    encoding = Column(Text)
    bozo = Column(Integer)
    lastfetched = Column(DateTime)
    lastsuccessful = Column(DateTime)

    def __init__(self, parser):
        self.update(parser)

    def __repr__(self):
        return "<Feedinfo('%s','%s','%s')>" % (self.title, self.subtitle, self.author)

    def update(self, parser):
        if parser.feed.has_key('title'):
            self.title = parser.feed.get('title').encode('latin-1', 'replace')
        if parser.feed.has_key('link'):
            self.link = parser.feed.get('link')
        if parser.feed.has_key('subtitle'):
            self.subtitle = parser.feed.get('subtitle').encode('latin-1', 'replace')
        if parser.feed.has_key('author'):
            self.author = parser.feed.get('author').encode('latin-1', 'replace')
        if parser.feed.has_key('publisher'):
            self.publisher = parser.feed.get('publisher').encode('latin-1', 'replace')
        self.status = parser.get('status')
        self.version = parser.get('version')
        self.encoding = parser.get('encoding')
        self.bozo = parser.get('bozo')
        self.lastfetched = datetime.now()
        if parser.get('status') == 200 or parser.get('status') == 302:
            self.lastsuccessful = datetime.now()

class Entry(Base):
    """A single feed entry, stored together with the fetched variants of its text."""
    __tablename__ = 'entry'

    id = Column(Integer, primary_key=True)
    feed_id = Column(Integer, ForeignKey('feed.id'))
    feed = relation("Feed", backref=backref('entry'))
    title = Column(Text)
    link = Column(Text)
    summary = Column(Text)
    content = Column(Text)
    author = Column(Text)
    enclosures = Column(Text)
    fullpage = Column(Text)
    readability = Column(Text)
    updated = Column(DateTime)
    firstfetched = Column(DateTime)
    lastfetched = Column(DateTime)
    sent = Column(DateTime)

    def __init__(self, entry):
        self.update(entry)
        self.firstfetched = datetime.now()

    def __repr__(self):
        return "<Entry('%s')>" % self.title

    def update(self, entry):
        if entry.has_key('title'):
            self.title = entry.get('title').encode('latin-1', 'replace')
        if entry.has_key('link'):
            self.link = entry.get('link').encode('latin-1', 'replace')
        if entry.has_key('summary'):
            self.summary = entry.get('summary').encode('latin-1', 'replace')
        if entry.has_key('content'):
            self.content = entry.get('content')[0].value.encode('latin-1', 'replace')
        if entry.has_key('author'):
            self.author = entry.get('author').encode('latin-1', 'replace')
        if entry.has_key('updated_parsed'):
            updated_parsed = entry.get('updated_parsed')
            self.updated = datetime.fromtimestamp(mktime(updated_parsed))
        if entry.has_key('enclosures') and len(entry.get('enclosures')) > 0:
            # enclosures are only dumped for inspection, they are not stored yet
            print 'enclosures'
            pp = pprint.PrettyPrinter(depth=4)
            pp.pprint(entry.get('enclosures'))
            #self.enclosures = entry.get('enclosures').encode('latin-1', 'replace')
        self.lastfetched = datetime.now()

def send_mail(sender, receiver, subject, body):
    """Send a plain-text mail via the local SMTP server."""
    print 'sending to %s: %s' % (receiver.decode('latin-1'), subject.decode('latin-1'))
    mail = MIMEText(body)
    mail['From'] = sender
    mail['To'] = receiver
    mail['Subject'] = subject
    mailserver = smtplib.SMTP('localhost')
    mailserver.sendmail(sender, [receiver], mail.as_string())
    mailserver.quit()

def get_entry_text(entry):
    """Return the best available text for an entry: readability, full page, or summary."""
    if entry.readability:
        text = entry.readability
    elif entry.fullpage:
        text = entry.fullpage
    elif entry.summary:
        text = entry.summary
    else:
        text = 'no text, sorry'
    return text

def mail_daily_digest(session, sender, receiver, prefix):
    """Mail one digest containing the entries of all feeds marked as 'daily'."""
    print 'mailing daily digest...'
    body = ''
    count = 0
    for feed, feedinfo, entry in session.query(Feed, Feedinfo, Entry).filter(Feed.id==Feedinfo.feed_id).filter(Feed.id==Entry.feed_id).filter(Feed.enabled==1).filter(Feed.daily==1).order_by(desc(Entry.updated)).all():
        count = count + 1
        body = body + '=> %s - %s\n' % (entry.updated.strftime('%y%m%d-%H%M'), feedinfo.title)
        body = body + ' %s\n' % entry.title
        body = body + '%s\n' % get_entry_text(entry)[0:100]
        body = body + '%s\n\n' % entry.link
    today = datetime.now()
    subject = '%s (%s) - %d entries' % (today.strftime('%y%m%d'), today.strftime('%A'), count)
    if prefix != '':
        subject = '%s %s' % (prefix, subject)
    send_mail(sender, receiver, subject, body)

def mail_single_entry(feed, feedinfo, entry, sender, receiver, prefix):
    subject = '%s' % (entry.title)
    if prefix != '':
        subject = '%s %s' % (prefix, subject)
    body = '%s\n\n' % get_entry_text(entry)
    body = body + '%s\n' % feedinfo.link
    body = body + '%s\n' % entry.link
    send_mail(sender, receiver, subject, body)

def mail_single_entries(session, sender, receiver, prefix):
    """Mail each entry of the non-daily feeds as an individual message."""
    print 'mailing single entries...'
    # note: Entry.sent is never set anywhere, so entries are mailed again on every run
    for feed, feedinfo, entry in session.query(Feed, Feedinfo, Entry).filter(Feed.id==Feedinfo.feed_id).filter(Feed.id==Entry.feed_id).filter(Feed.enabled==1).filter(Feed.daily==0).all():
        mail_single_entry(feed, feedinfo, entry, sender, receiver, prefix)

def fetch_readability(link):
    """Fetch the 'readability' text for a link using the hn module."""
    text = hn.upgradeLink(link)
    text = text.decode('utf8')
    return text

def fetch_full_page(link):
    """Fetch the linked page and convert it to plain text with html2text."""
    opener = urllib.FancyURLopener({})
    response = opener.open(link)
    html = response.read()
    html = html.decode('utf8')
    text = html2text.html2text(html)
    return text.encode('latin-1', 'replace')

def process_feed_entry(session, feed, entry):
    """Store a parsed entry; return 1 if it was new, 0 if it was already known."""
    #query = session.query(Entry).filter_by(feed_id=feed.id, title=entry.title.encode('latin-1', 'replace'))
    title = entry.title.encode('latin-1', 'replace')
    link = entry.link.encode('latin-1', 'replace')
    query = session.query(Entry).filter(Entry.feed_id==feed.id).filter(Entry.title==title).filter(Entry.link==link)
    try:
        thisentry = query.one()
        thisentry.update(entry)
        print ' entry already known <%s>' % entry.title
        return 0
    except Exception, e:
        print ' new entry <%s>' % entry.title
        thisentry = Entry(entry)
    if feed.fullpage:
        print ' fetching full page <%s>' % entry.link
        thisentry.fullpage = fetch_full_page(entry.link)
    if feed.readability:
        print ' fetching readability <%s>' % entry.link
        thisentry.readability = fetch_readability(entry.link)
    if feed.html2textsummary:
        print ' converting summary'
        summary = thisentry.summary.decode('latin-1')
        summary = html2text.html2text(summary)
        thisentry.summary = summary.encode('latin-1', 'replace')
    feed.entry.append(thisentry)
    return 1

def fetch_single_feed(session, feed):
    """Fetch one feed if its frequency allows it and store any new entries."""
    print 'processing %s' % feed.url
    query = session.query(Feedinfo).filter(Feedinfo.feed_id==feed.id)
    fetched = False
    try:
        feed.feedinfo = query.one()
        nextfetch = (feed.feedinfo.lastfetched + timedelta(minutes=feed.frequency))
        if datetime.now() > nextfetch:
            print 'fetching...'
            parser = feedparser.parse(feed.url)
            fetched = True
            feed.feedinfo.update(parser)
        else:
            print 'not fetching before: %s' % nextfetch
    except Exception, e:
        print 'this feed seems to be new'
        print 'fetching...'
        parser = feedparser.parse(feed.url)
        fetched = True
        feed.feedinfo = Feedinfo(parser)
    if fetched:
        print 'processing feed entries:'
        entries_new = 0
        entries_total = 0
        for entry in parser.entries:
            entries_total = entries_total + 1
            entries_new = entries_new + process_feed_entry(session, feed, entry)
        session.commit()
        print 'updated %d of %d entries' % (entries_new, entries_total)

def fetch_all_feeds(session):
    """Fetch every enabled feed."""
    print 'fetching all feeds...'
    for feed in session.query(Feed).filter_by(enabled=1).order_by(Feed.id):
        fetch_single_feed(session, feed)
        print

if __name__ == '__main__':
    config = ConfigParser.ConfigParser()
    config.read('atomstrom.conf')
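    # Illustrative atomstrom.conf layout, assembled from the config.get() calls
    # in this block; the section and option names come from this script, the
    # values are made-up placeholders:
    #
    #   [database]
    #   engine = mysql
    #   user = atomstrom
    #   password = secret
    #   hostname = localhost
    #   database = atomstrom
    #
    #   [email]
    #   sender = atomstrom@example.com
    #   receiver = you@example.com
    #   prefix_single = [feed]
    #   prefix_digest = [daily]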
    dbconnectstring = '%s://%s:%s@%s/%s' % (
        config.get('database', 'engine'),
        config.get('database', 'user'),
        config.get('database', 'password'),
        config.get('database', 'hostname'),
        config.get('database', 'database'),
    )
    engine = create_engine(dbconnectstring)
    Base.metadata.create_all(engine)
    Session = sessionmaker(bind=engine)
    session = Session()

    # seed examples; Feed() arguments are (url, daily, readability, fullpage, enabled, html2textsummary)
    #session.add(Feed('http://www.heise.de/newsticker/heise-atom.xml', 1, 0, 0, 1, 1))
    #session.add(Feed('http://blog.schatenseite.de/feed/', 1, 0, 0, 1, 1))

    parser = OptionParser()
    parser.add_option("-f", "--fetch", action="store_true", dest="fetch", default=False, help="fetch all feeds")
    parser.add_option("-s", "--single", action="store_true", dest="single", default=False, help="send single mails")
    parser.add_option("-d", "--daily", action="store_true", dest="daily", default=False, help="send daily digest")
    (options, args) = parser.parse_args()

    if options.fetch:
        fetch_all_feeds(session)
    if options.single:
        sender = config.get('email', 'sender')
        receiver = config.get('email', 'receiver')
        prefix = config.get('email', 'prefix_single')
        mail_single_entries(session, sender, receiver, prefix)
    if options.daily:
        sender = config.get('email', 'sender')
        receiver = config.get('email', 'receiver')
        prefix = config.get('email', 'prefix_digest')
        mail_daily_digest(session, sender, receiver, prefix)
    if not (options.fetch or options.single or options.daily):
        parser.print_help()

    session.commit()