#!/usr/bin/env python
import ConfigParser
import datetime
import re
import sys
import urllib
from optparse import OptionParser

import feedparser
import html2text
from sqlalchemy import create_engine, Table, Column, Integer, Text, Boolean, DateTime, MetaData, ForeignKey, desc
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker, relation, backref
from sqlalchemy.orm.exc import NoResultFound

import hn
Base = declarative_base()
class Feed(Base):
__tablename__ = 'feed'
id = Column(Integer, primary_key=True)
url = Column(Text)
daily = Column(Boolean)
readability = Column(Boolean)
fullpage = Column(Boolean)
html2textsummary = Column(Boolean)
2010-10-26 23:02:37 +02:00
enabled = Column(Boolean)
def __init__(self, url, daily, readability, fullpage, enabled, html2textsummary):
2010-10-26 23:02:37 +02:00
self.url = url
self.daily = daily
self.readability = readability
self.fullpage = fullpage
self.html2textsummary = html2textsummary
2010-10-26 23:02:37 +02:00
self.enabled = enabled
def __repr__(self):
return "<Feed('%s','%s','%s')>" % (self.url, self.daily, self.readability)
class Feedinfo(Base):
__tablename__ = 'feedinfo'
id = Column(Integer, primary_key=True)
feed_id = Column(Integer, ForeignKey('feed.id'))
feed = relation("Feed", backref=backref('feedinfo', uselist=False))
title = Column(Text)
link = Column(Text)
subtitle = Column(Text)
author = Column(Text)
publisher = Column(Text)
status = Column(Integer)
version = Column(Text)
encoding = Column(Text)
bozo = Column(Integer)
2010-10-26 23:02:37 +02:00
lastfetched = Column(DateTime)
lastsuccessful = Column(DateTime)
def __init__(self, parser):
self.update(parser)
def __repr__(self):
return "<Feedinfo('%s','%s','%s')>" % (self.title, self.subtitle, self.author)
def update(self, parser):
if parser.feed.has_key('title'):
self.title = parser.feed.get('title').encode('latin-1', 'replace')
if parser.feed.has_key('link'):
self.link = parser.feed.get('link')
if parser.feed.has_key('subtitle'):
self.subtitle = parser.feed.get('subtitle').encode('latin-1', 'replace')
if parser.feed.has_key('author'):
self.author = parser.feed.get('author').encode('latin-1', 'replace')
if parser.feed.has_key('publisher'):
self.author = parser.feed.get('publisher').encode('latin-1', 'replace')
self.status = parser.get('status')
self.version = parser.get('version')
self.encoding = parser.get('encoding')
self.bozo = parser.get('bozo')
self.lastfetched = datetime.datetime.now()
if parser.get('status') == 200:
self.lastsuccessful = datetime.datetime.now()
class Entry(Base):
__tablename__ = 'entry'
id = Column(Integer, primary_key=True)
feed_id = Column(Integer, ForeignKey('feed.id'))
feed = relation("Feed", backref=backref('entry'))
title = Column(Text)
link = Column(Text)
summary = Column(Text)
content = Column(Text)
author = Column(Text)
enclosures = Column(Text)
fullpage = Column(Text)
2010-10-30 00:21:24 +02:00
readability = Column(Text)
firstfetched = Column(DateTime)
2010-10-26 23:02:37 +02:00
lastfetched = Column(DateTime)
sent = Column(DateTime)
2010-10-26 23:02:37 +02:00
def __init__(self, entry):
self.update(entry)
self.firstfetched = datetime.datetime.now()
2010-10-26 23:02:37 +02:00
def __repr__(self):
return "<Entry('%s','%s','%s')>" % (self.title, "", "")
def update(self, entry):
if entry.has_key('title'):
self.title = entry.get('title').encode('latin-1', 'replace')
if entry.has_key('link'):
self.link = entry.get('link').encode('latin-1', 'replace')
if entry.has_key('summary'):
self.summary = entry.get('summary').encode('latin-1', 'replace')
if entry.has_key('content'):
self.content = entry.get('content').encode('latin-1', 'replace')
if entry.has_key('author'):
self.author = entry.get('author').encode('latin-1', 'replace')
if entry.has_key('enclosures'):
self.enclosures = entry.get('enclosures').encode('latin-1', 'replace')
self.lastfetched = datetime.datetime.now()
2010-10-30 11:16:37 +02:00
def send_mail(sender, subject, body):
print 'Sender: %s' % sender.decode('latin-1')
print 'Subject: %s' % subject.decode('latin-1')
print 'Body: %s' % body.decode('latin-1')
def get_entry_text(entry):
if entry.readability:
text = entry.readability
elif entry.fullpage:
text = entry.fullpage
elif entry.summary:
text = entry.summary
else:
text = 'no text, sorry'
return text
def mail_daily_digest(session):
print 'mailing daily digest...'
sender = 'atomstrom'
body = ''
count = 0
for feed, feedinfo, entry in session.query(Feed, Feedinfo, Entry).filter(Feed.id==Feedinfo.feed_id).filter(Feed.id==Entry.feed_id).filter(Feed.enabled==1).filter(Feed.daily==1).order_by(desc(Entry.firstfetched)).all():
count = count + 1
body = body + '=> %s - %s\n' % (entry.firstfetched.strftime('%y%m%d-%H%M'), feedinfo.title)
body = body + ' %s\n' % entry.title
body = body + get_entry_text(entry)[0:100]
body = body + '\n'
body = body + 'link: [%s]\n\n' % entry.link
today = datetime.datetime.now()
subject = '[atomstrom] %s (%s) - %d entries' % (today.strftime('%y%m%d'), today.strftime('%A'), count)
send_mail(sender, subject, body)
2010-10-30 11:16:37 +02:00
def mail_single_entry(feed, feedinfo, entry):
sender = feedinfo.title
subject = entry.title
body = get_entry_text(entry)
2010-10-30 11:16:37 +02:00
body = body + '\n\n'
body = body + 'site: [%s]\n' % feedinfo.link
body = body + 'link: [%s]\n' % entry.link
send_mail(sender, subject, body)
def mail_single_entries(session):
print 'mailing single entries...'
2010-10-30 11:16:37 +02:00
for feed, feedinfo, entry in session.query(Feed, Feedinfo, Entry).filter(Feed.id==Feedinfo.feed_id).filter(Feed.id==Entry.feed_id).filter(Feed.enabled==1).filter(Feed.daily==0).all():
mail_single_entry(feed, feedinfo, entry)
2010-10-30 00:21:24 +02:00
def fetch_readability(link):
text = hn.upgradeLink(link)
text = text.decode('utf8')
return text
2010-10-29 09:09:25 +02:00
def fetch_full_page(link):
opener = urllib.FancyURLopener({})
response = opener.open(link)
html = response.read()
html = html.decode('utf8')
text = html2text.html2text(html)
return text.encode('latin-1', 'replace')
2010-10-29 09:09:25 +02:00
def process_feed_entry(session, feed, entry):
#query = session.query(Entry).filter_by(feed_id=feed.id, title=entry.title.encode('latin-1', 'replace'))
title = entry.title.encode('latin-1', 'replace')
link = entry.link.encode('latin-1', 'replace')
query = session.query(Entry).filter(Entry.feed_id==feed.id).filter(Entry.title==title).filter(Entry.link==link)
try:
thisentry = query.one()
thisentry.update(entry)
print ' entry already known <%s>' % entry.title
return 0
except Exception, e:
print ' new entry <%s>' % entry.title
2010-10-29 09:09:25 +02:00
thisentry = Entry(entry)
2010-10-30 11:16:37 +02:00
if feed.fullpage:
print ' fetching full page <%s>' % entry.link
2010-10-29 09:09:25 +02:00
thisentry.fullpage = fetch_full_page(entry.link)
2010-10-30 11:16:37 +02:00
if feed.readability:
print ' fetching readability <%s>' % entry.link
2010-10-30 00:21:24 +02:00
thisentry.readability = fetch_readability(entry.link)
if feed.html2textsummary:
print ' converting summary'
summary = thisentry.summary.decode('latin-1')
summary = html2text.html2text(summary)
thisentry.summary = summary.encode('latin-1', 'replace')
2010-10-29 09:09:25 +02:00
feed.entry.append(thisentry)
return 1
def fetch_single_feed(session, feed):
print 'fetching %s' % feed.url
2010-10-26 23:02:37 +02:00
parser = feedparser.parse(feed.url)
print 'processing feed info...'
2010-10-26 23:02:37 +02:00
query = session.query(Feedinfo).filter(Feedinfo.feed_id==feed.id)
try:
feed.feedinfo = query.one()
feed.feedinfo.update(parser)
except Exception, e:
print 'this feed seems to be new'
2010-10-26 23:02:37 +02:00
feed.feedinfo = Feedinfo(parser)
print 'processing feed entries:'
entries_new = 0
entries_total = 0
2010-10-26 23:02:37 +02:00
for entry in parser.entries:
entries_total = entries_total + 1
entries_new = entries_new + process_feed_entry(session, feed, entry)
session.commit()
print 'updated %d of %d entries' % (entries_new, entries_total)
def fetch_all_feeds(session):
print 'fetching all feeds...'
for feed in session.query(Feed).filter_by(enabled=1).order_by(Feed.id):
fetch_single_feed(session, feed)
print
if __name__ == '__main__':
config = ConfigParser.ConfigParser()
config.read('atomstrom.conf')
dbconnectstring = '%s://%s:%s@%s/%s' % (
config.get('database', 'engine'),
config.get('database', 'user'),
config.get('database', 'password'),
config.get('database', 'hostname'),
config.get('database', 'database'),
)
engine = create_engine(dbconnectstring)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()
#session.add(Feed('http://www.heise.de/newsticker/heise-atom.xml', 1, 0, 0, 1, 1))
#session.add(Feed('http://blog.schatenseite.de/feed/', 1, 0, 0, 1, 1))
parser = OptionParser()
parser.add_option("-f", "--fetch", action="store_true", dest="fetch", default=False, help="fetch all feeds")
parser.add_option("-s", "--single", action="store_true", dest="single", default=False, help="send single mails")
parser.add_option("-d", "--daily", action="store_true", dest="daily", default=False, help="send daily digest")
(options, args) = parser.parse_args()
if options.fetch:
fetch_all_feeds(session)
if options.single:
mail_single_entries(session)
if options.daily:
mail_daily_digest(session)
2010-10-26 23:02:37 +02:00
session.commit()