refactored processing of entries, enabled processing of enclosures
This commit is contained in:
		
							
								
								
									
										58
									
								
								atomstrom.py
									
									
									
									
									
								
							
							
						
						
									
										58
									
								
								atomstrom.py
									
									
									
									
									
								
							| @@ -10,11 +10,6 @@ from ddate import ddate | |||||||
| import feedparser | import feedparser | ||||||
| import sys | import sys | ||||||
| import codecs | import codecs | ||||||
| #import urllib |  | ||||||
| import urllib2 |  | ||||||
| #import hn |  | ||||||
| import html2text |  | ||||||
| import HTMLParser |  | ||||||
| import ConfigParser | import ConfigParser | ||||||
| from argparse import ArgumentParser | from argparse import ArgumentParser | ||||||
| from email.header import Header | from email.header import Header | ||||||
| @@ -112,19 +107,6 @@ def mail_single_entries(session, sender, receiver, prefix): | |||||||
|     else: |     else: | ||||||
|         print 'no unmailed single entries found... not sending mail.' |         print 'no unmailed single entries found... not sending mail.' | ||||||
|  |  | ||||||
| def fetch_readability(link): |  | ||||||
|     text = hn.upgradeLink(link) |  | ||||||
|     text = text.decode('utf8') |  | ||||||
|     return text |  | ||||||
|  |  | ||||||
| def fetch_full_page(link): |  | ||||||
|     opener = urllib.FancyURLopener({}) |  | ||||||
|     response = opener.open(link) |  | ||||||
|     html = response.read() |  | ||||||
|     html = html.decode('utf8') |  | ||||||
|     text = html2text.html2text(html) |  | ||||||
|     return text.encode('latin-1', 'replace') |  | ||||||
|  |  | ||||||
| def process_feed_entry(session, feed, entry): | def process_feed_entry(session, feed, entry): | ||||||
|     thisentry = session.query(Entry).\ |     thisentry = session.query(Entry).\ | ||||||
|                 filter(Entry.title == entry.title).\ |                 filter(Entry.title == entry.title).\ | ||||||
| @@ -137,45 +119,7 @@ def process_feed_entry(session, feed, entry): | |||||||
|         return 0 |         return 0 | ||||||
|     else: |     else: | ||||||
|         print '  new entry <%s>' % entry.title |         print '  new entry <%s>' % entry.title | ||||||
|         thisentry = Entry(entry) |         feed.entries.append(Entry(entry, feed)) | ||||||
|         if feed.resolveredirects: |  | ||||||
|             print '    fetching final link <%s>' % entry.link |  | ||||||
|             request = urllib2.Request(entry.link) |  | ||||||
|             opener = urllib2.build_opener() |  | ||||||
|             result = opener.open(request) |  | ||||||
|             thisentry.resolvedlink = result.url |  | ||||||
|             print '    final link: <%s>' % result.url |  | ||||||
|         if feed.fullpage: |  | ||||||
|             print '    fetching full page <%s>' % entry.link |  | ||||||
|             thisentry.fullpage = fetch_full_page(entry.link) |  | ||||||
|         if feed.readability: |  | ||||||
|             print '    fetching readability <%s>' % entry.link |  | ||||||
|             thisentry.readability = fetch_readability(entry.link) |  | ||||||
|         if feed.html2textcontent: |  | ||||||
|             print '    converting summary' |  | ||||||
|             h2t = html2text.HTML2Text() |  | ||||||
|             h2t.body_width = 0 |  | ||||||
|             h2t.inline_links = False |  | ||||||
|             if feed.html2textignoreimages: |  | ||||||
|                 h2t.ignore_images = True |  | ||||||
|             if feed.contentcolumn == 'summary': |  | ||||||
|                 thisentry.summary = h2t.handle(thisentry.summary) |  | ||||||
|             elif feed.contentcolumn == 'content': |  | ||||||
|                 thisentry.content = h2t.handle(thisentry.content) |  | ||||||
|             elif feed.contentcolumn == 'fullpage': |  | ||||||
|                 thisentry.fullpage = h2t.handle(thisentry.fullpage) |  | ||||||
|             elif feed.contentcolumn == 'readability': |  | ||||||
|                 thisentry.readability = h2t.handle(thisentry.readability) |  | ||||||
|         hp = HTMLParser.HTMLParser() |  | ||||||
|         if thisentry.summary: |  | ||||||
|             thisentry.summary = hp.unescape(thisentry.summary) |  | ||||||
|         if thisentry.content: |  | ||||||
|             thisentry.content = hp.unescape(thisentry.content) |  | ||||||
|         if thisentry.fullpage: |  | ||||||
|             thisentry.fullpage = hp.unescape(thisentry.fullpage) |  | ||||||
|         if thisentry.readability: |  | ||||||
|             thisentry.readability = hp.unescape(thisentry.readability) |  | ||||||
|         feed.entries.append(thisentry) |  | ||||||
|         session.commit() |         session.commit() | ||||||
|         return 1 |         return 1 | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										102
									
								
								models/entry.py
									
									
									
									
									
								
							
							
						
						
									
										102
									
								
								models/entry.py
									
									
									
									
									
								
							| @@ -4,10 +4,34 @@ | |||||||
| from sqlalchemy import Column, Integer, ForeignKey, String, Text, DateTime | from sqlalchemy import Column, Integer, ForeignKey, String, Text, DateTime | ||||||
| from datetime import datetime | from datetime import datetime | ||||||
| from time import mktime | from time import mktime | ||||||
| import pprint | import urllib2 | ||||||
|  | #import urllib | ||||||
|  | #import hn | ||||||
|  | import html2text | ||||||
|  | import HTMLParser | ||||||
|  |  | ||||||
| from models import Base | from models import Base | ||||||
|  |  | ||||||
|  | def fetch_readability(link): | ||||||
|  |     text = hn.upgradeLink(link) | ||||||
|  |     text = text.decode('utf8') | ||||||
|  |     return text | ||||||
|  |  | ||||||
|  | def fetch_full_page(link): | ||||||
|  |     opener = urllib.FancyURLopener({}) | ||||||
|  |     response = opener.open(link) | ||||||
|  |     html = response.read() | ||||||
|  |     html = html.decode('utf8') | ||||||
|  |     text = html2text.html2text(html) | ||||||
|  |     return text.encode('latin-1', 'replace') | ||||||
|  |  | ||||||
|  | def size_human_readable(bytesize): | ||||||
|  |     for x in ['bytes','KB','MB','GB']: | ||||||
|  |         if bytesize < 1024.0: | ||||||
|  |             return "%3.1f%s" % (bytesize, x) | ||||||
|  |         bytesize /= 1024.0 | ||||||
|  |     return "%3.1f%s" % (bytesize, 'TB') | ||||||
|  |  | ||||||
| class Entry(Base): | class Entry(Base): | ||||||
|     __tablename__ = 'entry' |     __tablename__ = 'entry' | ||||||
|  |  | ||||||
| @@ -28,20 +52,7 @@ class Entry(Base): | |||||||
|     lastfetched = Column(DateTime) |     lastfetched = Column(DateTime) | ||||||
|     sent = Column(DateTime) |     sent = Column(DateTime) | ||||||
|  |  | ||||||
|     def __init__(self, entry): |     def __init__(self, entry, feed): | ||||||
|         self.update(entry) |  | ||||||
|         self.firstfetched = datetime.now() |  | ||||||
|  |  | ||||||
|     def __unicode__(self): |  | ||||||
|         return u'%d -> %s' % (self.id, self.title) |  | ||||||
|  |  | ||||||
|     def __str__(self): |  | ||||||
|         return unicode(self).encode('utf-8') |  | ||||||
|  |  | ||||||
|     def __repr__(self): |  | ||||||
|        return "<Entry('%d','%s')>" % (self.id, self.title) |  | ||||||
|  |  | ||||||
|     def update(self, entry): |  | ||||||
|         if entry.has_key('title'): |         if entry.has_key('title'): | ||||||
|             self.title = entry.get('title') |             self.title = entry.get('title') | ||||||
|         if entry.has_key('link'): |         if entry.has_key('link'): | ||||||
| @@ -56,12 +67,63 @@ class Entry(Base): | |||||||
|             updated_parsed = entry.get('updated_parsed') |             updated_parsed = entry.get('updated_parsed') | ||||||
|             self.updated = datetime.fromtimestamp(mktime(updated_parsed)) |             self.updated = datetime.fromtimestamp(mktime(updated_parsed)) | ||||||
|         if entry.has_key('enclosures') and len(entry.get('enclosures')) > 0: |         if entry.has_key('enclosures') and len(entry.get('enclosures')) > 0: | ||||||
|             print 'enclosures'; |             enclosures = '' | ||||||
|             pp=pprint.PrettyPrinter(depth=4) |             for enclosure in entry.get('enclosures'): | ||||||
|             pp.pprint(entry.get('enclosures')) |                 url = enclosure['href'] | ||||||
|             #self.enclosures = entry.get('enclosures') |                 length = size_human_readable(int(enclosure['length'])) | ||||||
|  |                 if not enclosures == '': | ||||||
|  |                     enclosures += '\n' | ||||||
|  |                 enclosures += '%s (%s)' % (url, length) | ||||||
|  |             self.enclosures = enclosures | ||||||
|  |         if feed.resolveredirects: | ||||||
|  |             print '    fetching final link <%s>' % entry.link | ||||||
|  |             request = urllib2.Request(entry.link) | ||||||
|  |             opener = urllib2.build_opener() | ||||||
|  |             result = opener.open(request) | ||||||
|  |             self.resolvedlink = result.url | ||||||
|  |             print '    final link: <%s>' % result.url | ||||||
|  |         if feed.fullpage: | ||||||
|  |             print '    fetching full page <%s>' % entry.link | ||||||
|  |             self.fullpage = fetch_full_page(entry.link) | ||||||
|  |         if feed.readability: | ||||||
|  |             print '    fetching readability <%s>' % entry.link | ||||||
|  |             self.readability = fetch_readability(entry.link) | ||||||
|  |         if feed.html2textcontent: | ||||||
|  |             print '    converting summary' | ||||||
|  |             h2t = html2text.HTML2Text() | ||||||
|  |             h2t.body_width = 0 | ||||||
|  |             h2t.inline_links = False | ||||||
|  |             if feed.html2textignoreimages: | ||||||
|  |                 h2t.ignore_images = True | ||||||
|  |             if feed.contentcolumn == 'summary': | ||||||
|  |                 self.summary = h2t.handle(self.summary) | ||||||
|  |             elif feed.contentcolumn == 'content': | ||||||
|  |                 self.content = h2t.handle(self.content) | ||||||
|  |             elif feed.contentcolumn == 'fullpage': | ||||||
|  |                 self.fullpage = h2t.handle(self.fullpage) | ||||||
|  |             elif feed.contentcolumn == 'readability': | ||||||
|  |                 self.readability = h2t.handle(self.readability) | ||||||
|  |         hp = HTMLParser.HTMLParser() | ||||||
|  |         if self.summary: | ||||||
|  |             self.summary = hp.unescape(self.summary) | ||||||
|  |         if self.content: | ||||||
|  |             self.content = hp.unescape(self.content) | ||||||
|  |         if self.fullpage: | ||||||
|  |             self.fullpage = hp.unescape(self.fullpage) | ||||||
|  |         if self.readability: | ||||||
|  |             self.readability = hp.unescape(self.readability) | ||||||
|  |         self.firstfetched = datetime.now() | ||||||
|         self.lastfetched = datetime.now() |         self.lastfetched = datetime.now() | ||||||
|  |  | ||||||
|  |     def __unicode__(self): | ||||||
|  |         return u'%d -> %s' % (self.id, self.title) | ||||||
|  |  | ||||||
|  |     def __str__(self): | ||||||
|  |         return unicode(self).encode('utf-8') | ||||||
|  |  | ||||||
|  |     def __repr__(self): | ||||||
|  |        return "<Entry('%d','%s')>" % (self.id, self.title) | ||||||
|  |  | ||||||
|     def get_text(self): |     def get_text(self): | ||||||
|         if self.feed.contentcolumn == 'summary': |         if self.feed.contentcolumn == 'summary': | ||||||
|             text = self.summary |             text = self.summary | ||||||
| @@ -71,5 +133,7 @@ class Entry(Base): | |||||||
|             text = self.fullpage |             text = self.fullpage | ||||||
|         elif self.feed.contentcolumn == 'readability': |         elif self.feed.contentcolumn == 'readability': | ||||||
|             text = self.readability |             text = self.readability | ||||||
|  |         if self.enclosures: | ||||||
|  |             text += '\n\nEnclosures:\n%s' % self.enclosures | ||||||
|         return text |         return text | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user