Atomstrom/atomstrom.py

#!/usr/bin/env python
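"""Atomstrom: feed-to-mail aggregator.

Fetches the feeds configured in a MySQL database via feedparser, stores
feed metadata and entries with SQLAlchemy, optionally grabs the full page
or a readability-style extract per entry, and mails new entries
(currently by printing them to stdout).
"""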
from sqlalchemy import create_engine, Table, Column, Integer, Text, Boolean, DateTime, MetaData, ForeignKey
from sqlalchemy.orm import sessionmaker, relation, backref
from sqlalchemy.ext.declarative import declarative_base
import datetime
import feedparser
import re
import sys
import urllib
import hn
Base = declarative_base()
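
# One row per configured feed: the feed URL plus flags controlling whether it
# goes into a daily digest, whether readability extraction and full-page
# fetching are done, and whether the feed is polled at all.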
class Feed(Base):
    __tablename__ = 'feed'
    id = Column(Integer, primary_key=True)
    url = Column(Text)
    daily = Column(Boolean)
    readability = Column(Boolean)
    fullpage = Column(Boolean)
    enabled = Column(Boolean)

    def __init__(self, url, daily, readability, fullpage, enabled):
        self.url = url
        self.daily = daily
        self.readability = readability
        self.fullpage = fullpage
        self.enabled = enabled

    def __repr__(self):
        return "<Feed('%s','%s','%s')>" % (self.url, self.daily, self.readability)
class Feedinfo(Base):
    __tablename__ = 'feedinfo'
    id = Column(Integer, primary_key=True)
    feed_id = Column(Integer, ForeignKey('feed.id'))
    feed = relation("Feed", backref=backref('feedinfo', uselist=False))
    title = Column(Text)
    link = Column(Text)
    subtitle = Column(Text)
    author = Column(Text)
    publisher = Column(Text)
    status = Column(Integer)
    version = Column(Text)
    encoding = Column(Text)
    bozo = Column(Integer)
    lastfetched = Column(DateTime)
    lastsuccessful = Column(DateTime)

    def __init__(self, parser):
        self.update(parser)

    def __repr__(self):
        return "<Feedinfo('%s','%s','%s')>" % (self.title, self.subtitle, self.author)

    def update(self, parser):
        if parser.feed.has_key('title'):
            self.title = parser.feed.get('title').encode('latin-1', 'replace')
        if parser.feed.has_key('link'):
            self.link = parser.feed.get('link')
        if parser.feed.has_key('subtitle'):
            self.subtitle = parser.feed.get('subtitle').encode('latin-1', 'replace')
        if parser.feed.has_key('author'):
            self.author = parser.feed.get('author').encode('latin-1', 'replace')
        if parser.feed.has_key('publisher'):
            self.publisher = parser.feed.get('publisher').encode('latin-1', 'replace')
        self.status = parser.get('status')
        self.version = parser.get('version')
        self.encoding = parser.get('encoding')
        self.bozo = parser.get('bozo')
        self.lastfetched = datetime.datetime.now()
        if parser.get('status') == 200:
            self.lastsuccessful = datetime.datetime.now()
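
# A single feed item, including the optional full-page download and
# readability extract; 'sent' is meant to record when the entry was mailed.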
class Entry(Base):
    __tablename__ = 'entry'
    id = Column(Integer, primary_key=True)
    feed_id = Column(Integer, ForeignKey('feed.id'))
    feed = relation("Feed", backref=backref('entry'))
    title = Column(Text)
    link = Column(Text)
    summary = Column(Text)
    content = Column(Text)
    author = Column(Text)
    enclosures = Column(Text)
    fullpage = Column(Text)
    readability = Column(Text)
    lastfetched = Column(DateTime)
    sent = Column(DateTime)

    def __init__(self, entry):
        self.update(entry)

    def __repr__(self):
        return "<Entry('%s','%s','%s')>" % (self.title, "", "")

    def update(self, entry):
        if entry.has_key('title'):
            self.title = entry.get('title').encode('latin-1', 'replace')
        if entry.has_key('link'):
            self.link = entry.get('link').encode('latin-1', 'replace')
        if entry.has_key('summary'):
            self.summary = entry.get('summary').encode('latin-1', 'replace')
        if entry.has_key('content'):
            # feedparser delivers content as a list of content objects; keep the text of the first one
            self.content = entry.get('content')[0].value.encode('latin-1', 'replace')
        if entry.has_key('author'):
            self.author = entry.get('author').encode('latin-1', 'replace')
        if entry.has_key('enclosures'):
            # enclosures arrive as a list of dicts; store their hrefs as a space-separated string
            self.enclosures = ' '.join([enclosure.get('href', '') for enclosure in entry.get('enclosures')]).encode('latin-1', 'replace')
        self.lastfetched = datetime.datetime.now()
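
# Connect to MySQL, create any missing tables and open a session;
# uncomment the session.add() calls below to register feeds.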
engine = create_engine('mysql://atomstrom:mdRTR4b8PLDqRSA4@localhost/atomstrom')
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()
#session.add(Feed('http://www.heise.de/newsticker/heise-atom.xml', 1, 0, 0, 1))
#session.add(Feed('http://blog.schatenseite.de/feed/', 1, 0, 0, 1))
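
# Mail delivery is still a stub: the "mail" is just printed to stdout;
# sender, subject and body are expected as latin-1 encoded byte strings.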
def send_mail(sender, subject, body):
    print 'Sender: %s' % sender.decode('latin-1')
    print 'Subject: %s' % subject.decode('latin-1')
    print 'Body: %s' % body.decode('latin-1')
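
# Daily digest mailing is not implemented yet.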
def mail_daily_digest():
    pass
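
# Compose one mail per entry: prefer the readability extract, then the full
# page, then the summary, and append the site and entry links.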
def mail_single_entry(feed, feedinfo, entry):
    sender = feedinfo.title
    subject = entry.title
    if entry.readability:
        body = entry.readability
    elif entry.fullpage:
        body = entry.fullpage
    elif entry.summary:
        body = entry.summary
    else:
        body = 'no text, sorry'
    body = body + '\n\n'
    body = body + 'site: [%s]\n' % feedinfo.link
    body = body + 'link: [%s]\n' % entry.link
    send_mail(sender, subject, body)
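
# Mail each entry of every enabled feed that is not set to daily-digest mode.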
def mail_single_entries():
    print "mailing single entries..."
    for feed, feedinfo, entry in session.query(Feed, Feedinfo, Entry).filter(Feed.id==Feedinfo.feed_id).filter(Feed.id==Entry.feed_id).filter(Feed.enabled==1).filter(Feed.daily==0).all():
        mail_single_entry(feed, feedinfo, entry)
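
# Run the entry link through the hn module's readability-style extractor
# and return the result as unicode.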
def fetch_readability(link):
    text = hn.upgradeLink(link)
    text = text.decode('utf8')
    return text
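
# Download the raw page behind the entry link.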
def fetch_full_page(link):
    opener = urllib.FancyURLopener({})
    response = opener.open(link)
    return response.read()
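
# Look up the entry by feed id and title; update it if it is already known
# (returns "-"), otherwise create it, optionally fetching the full page and
# a readability extract, and attach it to the feed (returns "+").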
def process_feed_entry(feed, entry):
    query = session.query(Entry).filter_by(feed_id=feed.id, title=entry.title.encode('latin-1', 'replace'))
    try:
        thisentry = query.one()
        thisentry.update(entry)
        return "-"
    except Exception, e:
        thisentry = Entry(entry)
        if feed.fullpage:
            thisentry.fullpage = fetch_full_page(entry.link)
        if feed.readability:
            thisentry.readability = fetch_readability(entry.link)
        feed.entry.append(thisentry)
        return "+"
def fetch_single_feed(feed):
    print "fetching %s" % feed.url
    parser = feedparser.parse(feed.url)
    print "processing feed info..."
    query = session.query(Feedinfo).filter(Feedinfo.feed_id==feed.id)
    try:
        feed.feedinfo = query.one()
        feed.feedinfo.update(parser)
    except Exception, e:
        feed.feedinfo = Feedinfo(parser)
    print "processing feed entries: ",
    entries_new = 0
    entries_total = 0
    for entry in parser.entries:
        entries_total = entries_total + 1
        ret = process_feed_entry(feed, entry)
        if ret == "+":
            entries_new = entries_new + 1
        sys.stdout.write(ret)
    print " (%d/%d new)" % (entries_new, entries_total)
def fetch_all_feeds():
    print "fetching all feeds..."
    for feed in session.query(Feed).filter_by(enabled=1).order_by(Feed.id):
        fetch_single_feed(feed)
        print
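
# Entry point: fetch all feeds, mail single entries, then commit everything
# to the database.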
if __name__ == "__main__":
    fetch_all_feeds()
    mail_single_entries()
    session.commit()