refactored processing of entries, enabled processing of enclosures
This commit is contained in:
parent
5b9cc700f8
commit
2718e6502c
58
atomstrom.py
58
atomstrom.py
@ -10,11 +10,6 @@ from ddate import ddate
|
|||||||
import feedparser
|
import feedparser
|
||||||
import sys
|
import sys
|
||||||
import codecs
|
import codecs
|
||||||
#import urllib
|
|
||||||
import urllib2
|
|
||||||
#import hn
|
|
||||||
import html2text
|
|
||||||
import HTMLParser
|
|
||||||
import ConfigParser
|
import ConfigParser
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
from email.header import Header
|
from email.header import Header
|
||||||
@ -112,19 +107,6 @@ def mail_single_entries(session, sender, receiver, prefix):
|
|||||||
else:
|
else:
|
||||||
print 'no unmailed single entries found... not sending mail.'
|
print 'no unmailed single entries found... not sending mail.'
|
||||||
|
|
||||||
def fetch_readability(link):
|
|
||||||
text = hn.upgradeLink(link)
|
|
||||||
text = text.decode('utf8')
|
|
||||||
return text
|
|
||||||
|
|
||||||
def fetch_full_page(link):
|
|
||||||
opener = urllib.FancyURLopener({})
|
|
||||||
response = opener.open(link)
|
|
||||||
html = response.read()
|
|
||||||
html = html.decode('utf8')
|
|
||||||
text = html2text.html2text(html)
|
|
||||||
return text.encode('latin-1', 'replace')
|
|
||||||
|
|
||||||
def process_feed_entry(session, feed, entry):
|
def process_feed_entry(session, feed, entry):
|
||||||
thisentry = session.query(Entry).\
|
thisentry = session.query(Entry).\
|
||||||
filter(Entry.title == entry.title).\
|
filter(Entry.title == entry.title).\
|
||||||
@ -137,45 +119,7 @@ def process_feed_entry(session, feed, entry):
|
|||||||
return 0
|
return 0
|
||||||
else:
|
else:
|
||||||
print ' new entry <%s>' % entry.title
|
print ' new entry <%s>' % entry.title
|
||||||
thisentry = Entry(entry)
|
feed.entries.append(Entry(entry, feed))
|
||||||
if feed.resolveredirects:
|
|
||||||
print ' fetching final link <%s>' % entry.link
|
|
||||||
request = urllib2.Request(entry.link)
|
|
||||||
opener = urllib2.build_opener()
|
|
||||||
result = opener.open(request)
|
|
||||||
thisentry.resolvedlink = result.url
|
|
||||||
print ' final link: <%s>' % result.url
|
|
||||||
if feed.fullpage:
|
|
||||||
print ' fetching full page <%s>' % entry.link
|
|
||||||
thisentry.fullpage = fetch_full_page(entry.link)
|
|
||||||
if feed.readability:
|
|
||||||
print ' fetching readability <%s>' % entry.link
|
|
||||||
thisentry.readability = fetch_readability(entry.link)
|
|
||||||
if feed.html2textcontent:
|
|
||||||
print ' converting summary'
|
|
||||||
h2t = html2text.HTML2Text()
|
|
||||||
h2t.body_width = 0
|
|
||||||
h2t.inline_links = False
|
|
||||||
if feed.html2textignoreimages:
|
|
||||||
h2t.ignore_images = True
|
|
||||||
if feed.contentcolumn == 'summary':
|
|
||||||
thisentry.summary = h2t.handle(thisentry.summary)
|
|
||||||
elif feed.contentcolumn == 'content':
|
|
||||||
thisentry.content = h2t.handle(thisentry.content)
|
|
||||||
elif feed.contentcolumn == 'fullpage':
|
|
||||||
thisentry.fullpage = h2t.handle(thisentry.fullpage)
|
|
||||||
elif feed.contentcolumn == 'readability':
|
|
||||||
thisentry.readability = h2t.handle(thisentry.readability)
|
|
||||||
hp = HTMLParser.HTMLParser()
|
|
||||||
if thisentry.summary:
|
|
||||||
thisentry.summary = hp.unescape(thisentry.summary)
|
|
||||||
if thisentry.content:
|
|
||||||
thisentry.content = hp.unescape(thisentry.content)
|
|
||||||
if thisentry.fullpage:
|
|
||||||
thisentry.fullpage = hp.unescape(thisentry.fullpage)
|
|
||||||
if thisentry.readability:
|
|
||||||
thisentry.readability = hp.unescape(thisentry.readability)
|
|
||||||
feed.entries.append(thisentry)
|
|
||||||
session.commit()
|
session.commit()
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
102
models/entry.py
102
models/entry.py
@ -4,10 +4,34 @@
|
|||||||
from sqlalchemy import Column, Integer, ForeignKey, String, Text, DateTime
|
from sqlalchemy import Column, Integer, ForeignKey, String, Text, DateTime
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from time import mktime
|
from time import mktime
|
||||||
import pprint
|
import urllib2
|
||||||
|
#import urllib
|
||||||
|
#import hn
|
||||||
|
import html2text
|
||||||
|
import HTMLParser
|
||||||
|
|
||||||
from models import Base
|
from models import Base
|
||||||
|
|
||||||
|
def fetch_readability(link):
    """Return the text produced by ``hn.upgradeLink`` for *link*.

    The value returned by ``hn.upgradeLink`` is decoded from UTF-8
    before it is handed back to the caller.
    """
    # NOTE(review): the only visible ``import hn`` is commented out, so
    # this name may be undefined at call time -- confirm before enabling
    # the readability option.
    upgraded = hn.upgradeLink(link)
    return upgraded.decode('utf8')
|
||||||
|
|
||||||
|
def fetch_full_page(link):
    """Download *link* and return the page converted to plain text.

    The HTML is fetched with ``urllib.FancyURLopener``, decoded as UTF-8,
    converted with ``html2text.html2text`` and returned as a latin-1
    encoded byte string (characters that cannot be encoded are replaced).
    """
    # Imported locally: the module-level ``import urllib`` is commented
    # out, so the bare name would otherwise raise NameError on first use.
    import urllib

    opener = urllib.FancyURLopener({})
    response = opener.open(link)
    try:
        html = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
    # Best effort: a page that is not valid UTF-8 should not abort the
    # fetch; the text is lossily re-encoded below anyway.
    html = html.decode('utf8', 'replace')
    text = html2text.html2text(html)
    return text.encode('latin-1', 'replace')
|
||||||
|
|
||||||
|
def size_human_readable(bytesize):
    """Format a byte count as a short string such as ``1.5KB``.

    The value is divided by 1024 until its magnitude drops below 1024
    or the largest unit (TB) is reached, then rendered with one decimal
    place directly followed by the unit name.

    Negative values are scaled by magnitude, so ``-2048`` yields
    ``-2.0KB`` rather than being reported in bytes.
    """
    size = float(bytesize)
    for unit in ('bytes', 'KB', 'MB', 'GB'):
        # Compare the magnitude so negative sizes pick the right unit.
        if abs(size) < 1024.0:
            return "%3.1f%s" % (size, unit)
        size /= 1024.0
    # Anything that is still >= 1024 GB is expressed in TB.
    return "%3.1f%s" % (size, 'TB')
|
||||||
|
|
||||||
class Entry(Base):
|
class Entry(Base):
|
||||||
__tablename__ = 'entry'
|
__tablename__ = 'entry'
|
||||||
|
|
||||||
@ -28,20 +52,7 @@ class Entry(Base):
|
|||||||
lastfetched = Column(DateTime)
|
lastfetched = Column(DateTime)
|
||||||
sent = Column(DateTime)
|
sent = Column(DateTime)
|
||||||
|
|
||||||
def __init__(self, entry):
|
def __init__(self, entry, feed):
|
||||||
self.update(entry)
|
|
||||||
self.firstfetched = datetime.now()
|
|
||||||
|
|
||||||
def __unicode__(self):
|
|
||||||
return u'%d -> %s' % (self.id, self.title)
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
return unicode(self).encode('utf-8')
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return "<Entry('%d','%s')>" % (self.id, self.title)
|
|
||||||
|
|
||||||
def update(self, entry):
|
|
||||||
if entry.has_key('title'):
|
if entry.has_key('title'):
|
||||||
self.title = entry.get('title')
|
self.title = entry.get('title')
|
||||||
if entry.has_key('link'):
|
if entry.has_key('link'):
|
||||||
@ -56,12 +67,63 @@ class Entry(Base):
|
|||||||
updated_parsed = entry.get('updated_parsed')
|
updated_parsed = entry.get('updated_parsed')
|
||||||
self.updated = datetime.fromtimestamp(mktime(updated_parsed))
|
self.updated = datetime.fromtimestamp(mktime(updated_parsed))
|
||||||
if entry.has_key('enclosures') and len(entry.get('enclosures')) > 0:
|
if entry.has_key('enclosures') and len(entry.get('enclosures')) > 0:
|
||||||
print 'enclosures';
|
enclosures = ''
|
||||||
pp=pprint.PrettyPrinter(depth=4)
|
for enclosure in entry.get('enclosures'):
|
||||||
pp.pprint(entry.get('enclosures'))
|
url = enclosure['href']
|
||||||
#self.enclosures = entry.get('enclosures')
|
length = size_human_readable(int(enclosure['length']))
|
||||||
|
if not enclosures == '':
|
||||||
|
enclosures += '\n'
|
||||||
|
enclosures += '%s (%s)' % (url, length)
|
||||||
|
self.enclosures = enclosures
|
||||||
|
if feed.resolveredirects:
|
||||||
|
print ' fetching final link <%s>' % entry.link
|
||||||
|
request = urllib2.Request(entry.link)
|
||||||
|
opener = urllib2.build_opener()
|
||||||
|
result = opener.open(request)
|
||||||
|
self.resolvedlink = result.url
|
||||||
|
print ' final link: <%s>' % result.url
|
||||||
|
if feed.fullpage:
|
||||||
|
print ' fetching full page <%s>' % entry.link
|
||||||
|
self.fullpage = fetch_full_page(entry.link)
|
||||||
|
if feed.readability:
|
||||||
|
print ' fetching readability <%s>' % entry.link
|
||||||
|
self.readability = fetch_readability(entry.link)
|
||||||
|
if feed.html2textcontent:
|
||||||
|
print ' converting summary'
|
||||||
|
h2t = html2text.HTML2Text()
|
||||||
|
h2t.body_width = 0
|
||||||
|
h2t.inline_links = False
|
||||||
|
if feed.html2textignoreimages:
|
||||||
|
h2t.ignore_images = True
|
||||||
|
if feed.contentcolumn == 'summary':
|
||||||
|
self.summary = h2t.handle(self.summary)
|
||||||
|
elif feed.contentcolumn == 'content':
|
||||||
|
self.content = h2t.handle(self.content)
|
||||||
|
elif feed.contentcolumn == 'fullpage':
|
||||||
|
self.fullpage = h2t.handle(self.fullpage)
|
||||||
|
elif feed.contentcolumn == 'readability':
|
||||||
|
self.readability = h2t.handle(self.readability)
|
||||||
|
hp = HTMLParser.HTMLParser()
|
||||||
|
if self.summary:
|
||||||
|
self.summary = hp.unescape(self.summary)
|
||||||
|
if self.content:
|
||||||
|
self.content = hp.unescape(self.content)
|
||||||
|
if self.fullpage:
|
||||||
|
self.fullpage = hp.unescape(self.fullpage)
|
||||||
|
if self.readability:
|
||||||
|
self.readability = hp.unescape(self.readability)
|
||||||
|
self.firstfetched = datetime.now()
|
||||||
self.lastfetched = datetime.now()
|
self.lastfetched = datetime.now()
|
||||||
|
|
||||||
|
def __unicode__(self):
|
||||||
|
return u'%d -> %s' % (self.id, self.title)
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return unicode(self).encode('utf-8')
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return "<Entry('%d','%s')>" % (self.id, self.title)
|
||||||
|
|
||||||
def get_text(self):
|
def get_text(self):
|
||||||
if self.feed.contentcolumn == 'summary':
|
if self.feed.contentcolumn == 'summary':
|
||||||
text = self.summary
|
text = self.summary
|
||||||
@ -71,5 +133,7 @@ class Entry(Base):
|
|||||||
text = self.fullpage
|
text = self.fullpage
|
||||||
elif self.feed.contentcolumn == 'readability':
|
elif self.feed.contentcolumn == 'readability':
|
||||||
text = self.readability
|
text = self.readability
|
||||||
|
if self.enclosures:
|
||||||
|
text += '\n\nEnclosures:\n%s' % self.enclosures
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user