Atomstrom/models/entry.py

# This file is part of Atomstrom
# Copyright (C) 2013  Ronald Schaten <ronald@schatenseite.de>
#
# Atomstrom is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program.  If not, see <http://www.gnu.org/licenses/>.

from sqlalchemy import Column, Integer, ForeignKey, String, Text, DateTime
from datetime import datetime
from time import mktime
import urllib2
from readability.readability import Document
import html2text
import HTMLParser

from models import Base

def fetch_readability(link):
    """Download *link* and return its extracted article text.

    The raw page is reduced with readability's ``Document(...).summary()``
    and the resulting HTML is converted with html2text, with links and
    images ignored.

    Args:
        link: URL to fetch.

    Returns:
        The text produced by ``HTML2Text.handle`` for the summarized page.

    Raises:
        Whatever ``urllib2.urlopen`` / ``read`` raise; nothing is caught here.
    """
    h2t = html2text.HTML2Text()
    h2t.body_width = 0  # html2text: 0 turns off line wrapping
    h2t.inline_links = False
    h2t.ignore_links = True
    h2t.ignore_images = True
    response = urllib2.urlopen(link)
    try:
        raw_html = response.read()
    finally:
        # urllib2 responses are not context managers on Python 2, so the
        # handle has to be released explicitly.
        response.close()
    summary_html = Document(raw_html).summary()
    return h2t.handle(summary_html)

def fetch_full_page(link):
    """Download *link* and return the page body decoded as UTF-8.

    Args:
        link: URL to fetch.

    Returns:
        The response body decoded with the ``utf8`` codec.

    Raises:
        UnicodeDecodeError: if the body is not valid UTF-8 (decoding is
            strict).
        Whatever ``urllib2.urlopen`` / ``read`` raise; nothing is caught here.
    """
    response = urllib2.urlopen(link)
    try:
        raw_html = response.read()
    finally:
        # urllib2 responses are not context managers on Python 2, so the
        # handle has to be released explicitly.
        response.close()
    return raw_html.decode('utf8')

def size_human_readable(bytesize):
    """Format a byte count with one decimal place and a unit suffix.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit is reached; units are ``bytes``, ``KB``, ``MB``, ``GB`` and ``TB``.
    Anything of 1024 GB or more is reported in ``TB``.
    """
    units = ('bytes', 'KB', 'MB', 'GB')
    remaining = bytesize
    position = 0
    while position < len(units):
        if remaining < 1024.0:
            return "%3.1f%s" % (remaining, units[position])
        remaining = remaining / 1024.0
        position += 1
    return "%3.1f%s" % (remaining, 'TB')

class Entry(Base):
    """One feed entry, stored in the ``entry`` table.

    The constructor copies the fields of a parsed entry and, depending on
    the feed's settings, resolves the final link, downloads the full page
    and/or a readability extract, and converts the selected content column
    from HTML to text.
    """
    __tablename__ = 'entry'

    # Values copied from the parsed feed entry.
    id = Column(Integer, primary_key=True)
    feed_id = Column(Integer, ForeignKey('feed.id'))
    title = Column(String(255))
    link = Column(String(255))
    summary = Column(Text)
    content = Column(Text)
    author = Column(String(255))
    # One enclosure per line: "<href>" or "<href> (<size>)".
    enclosures = Column(Text)

    # Values derived while the entry is constructed.
    resolvedlink = Column(String(255))  # URL reported after opening `link`
    fullpage = Column(Text)             # result of fetch_full_page(link)
    readability = Column(Text)          # result of fetch_readability(link)
    updated = Column(DateTime)
    firstfetched = Column(DateTime)
    lastfetched = Column(DateTime)
    # Never written in this class; presumably set by the sending code.
    sent = Column(DateTime)

    def __init__(self, entry, feed):
        """Build the row from a parsed entry and the owning feed's settings.

        Args:
            entry: dict-like parsed entry supporting ``in`` and ``get``.
                Content items are read through ``.type`` / ``.value`` and
                enclosures through ``['href']`` / ``['length']``.
                Presumably a feedparser entry -- confirm against the caller.
            feed: object providing the settings read here:
                ``html2textcontent``, ``resolveredirects``, ``fullpage``,
                ``readability``, ``html2textignorelinks``,
                ``html2textignoreimages`` and ``contentcolumn``.

        Raises:
            Errors from ``fetch_full_page`` and ``fetch_readability`` are
            not caught and propagate to the caller. Failures while resolving
            the final link are reported on stdout and otherwise ignored.
        """
        for key in ('title', 'link', 'summary', 'author'):
            if key in entry:
                setattr(self, key, entry.get(key))

        if 'content' in entry:
            contents = entry.get('content')
            # An empty content list must not abort the entry.
            if contents:
                # Default to the first variant, then prefer the variant
                # matching the feed's html2text setting (last match wins).
                self.content = contents[0].value
                if feed.html2textcontent:
                    wanted_type = 'text/plain'
                else:
                    wanted_type = 'text/html'
                for candidate in contents:
                    if candidate.type == wanted_type:
                        self.content = candidate.value

        if 'updated_parsed' in entry:
            updated_parsed = entry.get('updated_parsed')
            # The key can be present with no usable value; mktime(None)
            # would raise TypeError.
            if updated_parsed:
                # NOTE(review): mktime() interprets the struct as local
                # time; if the parser delivers UTC structs the result can be
                # off around DST changes -- confirm.
                self.updated = datetime.fromtimestamp(mktime(updated_parsed))

        if 'enclosures' in entry:
            enclosure_lines = []
            for enclosure in entry.get('enclosures') or ():
                line = enclosure['href']
                if 'length' in enclosure:
                    try:
                        size = size_human_readable(int(enclosure['length']))
                    except (TypeError, ValueError):
                        # An empty or non-numeric length must not abort the
                        # entry; list the enclosure without a size.
                        pass
                    else:
                        line += ' (%s)' % size
                enclosure_lines.append(line)
            if enclosure_lines:
                self.enclosures = '\n'.join(enclosure_lines)

        # Every network step below needs a link; entries without one are
        # stored with whatever the feed itself delivered.
        link = self.link
        if feed.resolveredirects and link:
            print('    fetching final link <%s>' % link)
            try:
                result = urllib2.build_opener().open(urllib2.Request(link))
                try:
                    self.resolvedlink = result.url
                finally:
                    result.close()
                print('    final link: <%s>' % self.resolvedlink)
            except Exception:
                # Best effort: the entry is kept without a resolved link.
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                print('    FAILED opening URL')
        if feed.fullpage and link:
            print('    fetching full page <%s>' % link)
            self.fullpage = fetch_full_page(link)
        if feed.readability and link:
            print('    fetching readability <%s>' % link)
            self.readability = fetch_readability(link)

        text_columns = ('summary', 'content', 'fullpage', 'readability')
        if feed.html2textcontent:
            print('    converting summary')
            h2t = html2text.HTML2Text()
            h2t.body_width = 0  # html2text: 0 turns off line wrapping
            h2t.inline_links = False
            if feed.html2textignorelinks:
                h2t.ignore_links = True
            if feed.html2textignoreimages:
                h2t.ignore_images = True
            # Only the column selected for this feed is converted.
            column = feed.contentcolumn
            if column in text_columns and getattr(self, column):
                setattr(self, column, h2t.handle(getattr(self, column)))

        # Replace HTML character references in every non-empty text column.
        hp = HTMLParser.HTMLParser()
        for column in text_columns:
            value = getattr(self, column)
            if value:
                setattr(self, column, hp.unescape(value))

        # One timestamp for both columns so a new entry has equal values.
        now = datetime.now()
        self.firstfetched = now
        self.lastfetched = now

    def __unicode__(self):
        """Return ``u'<id> -> <title>'``."""
        # %s instead of %d: id is None until the row has been flushed.
        return u'%s -> %s' % (self.id, self.title)

    def __str__(self):
        """Return the unicode representation encoded as UTF-8."""
        return unicode(self).encode('utf-8')

    def __repr__(self):
        """Return ``<Entry('<id>','<title>')>`` for debugging."""
        # %s instead of %d: id is None until the row has been flushed.
        return "<Entry('%s','%s')>" % (self.id, self.title)

    def get_text(self, enclosures=True):
        """Return the text of the column selected by ``feed.contentcolumn``.

        Args:
            enclosures: when true and the entry has enclosures, append an
                "Enclosures:" section listing them.

        Returns:
            The selected column's text, or an empty string if the column is
            empty or ``contentcolumn`` names none of the four text columns,
            followed by the optional enclosure section.
        """
        column = self.feed.contentcolumn
        text = ''
        if column in ('summary', 'content', 'fullpage', 'readability'):
            text = getattr(self, column) or ''
        if self.enclosures and enclosures:
            text += '\n\nEnclosures:\n%s' % self.enclosures
        return text

# -*- coding: utf-8 -*-
include license info in all source files 2013-04-17 21:24:05 +00:00			`# This file is part of Atomstrom`
			`# Copyright (C) 2013 Ronald Schaten <ronald@schatenseite.de>`
			`#`
			`# Atomstrom is free software: you can redistribute it and/or modify it under`
			`# the terms of the GNU General Public License as published by the Free Software`
			`# Foundation, either version 3 of the License, or (at your option) any later`
			`# version.`
			`#`
			`# This program is distributed in the hope that it will be useful, but WITHOUT`
			`# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS`
			`# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more`
			`# details.`
			`#`
			`# You should have received a copy of the GNU General Public License along with`
			`# this program. If not, see <http://www.gnu.org/licenses/>.`

refactored so the data model lives in its own module 2013-03-26 19:20:51 +00:00			`from sqlalchemy import Column, Integer, ForeignKey, String, Text, DateTime`
			`from datetime import datetime`
			`from time import mktime`
refactored processing of entries, enabled processing of enclosures 2013-04-08 19:02:48 +00:00			`import urllib2`
implemented newer version of readability-module 2013-04-15 21:33:24 +00:00			`from readability.readability import Document`
refactored processing of entries, enabled processing of enclosures 2013-04-08 19:02:48 +00:00			`import html2text`
			`import HTMLParser`
refactored so the data model lives in its own module 2013-03-26 19:20:51 +00:00
			`from models import Base`

refactored processing of entries, enabled processing of enclosures 2013-04-08 19:02:48 +00:00			`def fetch_readability(link):`
implemented newer version of readability-module 2013-04-15 21:33:24 +00:00			`h2t = html2text.HTML2Text()`
			`h2t.body_width = 0`
			`h2t.inline_links = False`
			`h2t.ignore_links = True`
			`h2t.ignore_images = True`
			`response = urllib2.urlopen(link)`
			`text = response.read()`
			`text = Document(text).summary()`
			`text = h2t.handle(text)`
refactored processing of entries, enabled processing of enclosures 2013-04-08 19:02:48 +00:00			`return text`

			`def fetch_full_page(link):`
enabled fetching of full pages 2013-04-11 15:10:39 +00:00			`response = urllib2.urlopen(link)`
refactored processing of entries, enabled processing of enclosures 2013-04-08 19:02:48 +00:00			`html = response.read()`
			`html = html.decode('utf8')`
enabled fetching of full pages 2013-04-11 15:10:39 +00:00			`return html`
refactored processing of entries, enabled processing of enclosures 2013-04-08 19:02:48 +00:00
			`def size_human_readable(bytesize):`
			`for x in ['bytes','KB','MB','GB']:`
			`if bytesize < 1024.0:`
			`return "%3.1f%s" % (bytesize, x)`
			`bytesize /= 1024.0`
			`return "%3.1f%s" % (bytesize, 'TB')`

refactored so the data model lives in its own module 2013-03-26 19:20:51 +00:00			`class Entry(Base):`
			`__tablename__ = 'entry'`

			`id = Column(Integer, primary_key=True)`
			`feed_id = Column(Integer, ForeignKey('feed.id'))`
			`title = Column(String(255))`
			`link = Column(String(255))`
			`summary = Column(Text)`
			`content = Column(Text)`
			`author = Column(String(255))`
			`enclosures = Column(Text)`

			`resolvedlink = Column(String(255))`
			`fullpage = Column(Text)`
			`readability = Column(Text)`
			`updated = Column(DateTime)`
			`firstfetched = Column(DateTime)`
			`lastfetched = Column(DateTime)`
			`sent = Column(DateTime)`

refactored processing of entries, enabled processing of enclosures 2013-04-08 19:02:48 +00:00			`def __init__(self, entry, feed):`
refactored so the data model lives in its own module 2013-03-26 19:20:51 +00:00			`if entry.has_key('title'):`
			`self.title = entry.get('title')`
			`if entry.has_key('link'):`
			`self.link = entry.get('link')`
			`if entry.has_key('summary'):`
			`self.summary = entry.get('summary')`
			`if entry.has_key('content'):`
			`self.content = entry.get('content')[0].value`
if multiple content-entries exist, select the plain- or html-one according to html2textcontent variable 2013-05-06 14:39:48 +00:00			`for content in entry.get('content'):`
			`if feed.html2textcontent and content.type == 'text/plain':`
			`self.content = content.value`
			`if not feed.html2textcontent and content.type == 'text/html':`
			`self.content = content.value`
refactored so the data model lives in its own module 2013-03-26 19:20:51 +00:00			`if entry.has_key('author'):`
			`self.author = entry.get('author')`
			`if entry.has_key('updated_parsed'):`
			`updated_parsed = entry.get('updated_parsed')`
			`self.updated = datetime.fromtimestamp(mktime(updated_parsed))`
			`if entry.has_key('enclosures') and len(entry.get('enclosures')) > 0:`
refactored processing of entries, enabled processing of enclosures 2013-04-08 19:02:48 +00:00			`enclosures = ''`
			`for enclosure in entry.get('enclosures'):`
			`if not enclosures == '':`
			`enclosures += '\n'`
small performance improvement, error correction in handling of enclosures 2013-04-08 19:19:29 +00:00			`enclosures += enclosure['href']`
			`if enclosure.has_key('length'):`
			`enclosures += ' (%s)' % size_human_readable(int(enclosure['length']))`
refactored processing of entries, enabled processing of enclosures 2013-04-08 19:02:48 +00:00			`self.enclosures = enclosures`
			`if feed.resolveredirects:`
			`print ' fetching final link <%s>' % entry.link`
			`request = urllib2.Request(entry.link)`
			`opener = urllib2.build_opener()`
prevent exception if unable to resolve final link 2013-04-09 10:39:12 +00:00			`try:`
			`result = opener.open(request)`
			`self.resolvedlink = result.url`
			`print ' final link: <%s>' % result.url`
			`except:`
			`print ' FAILED opening URL'`
refactored processing of entries, enabled processing of enclosures 2013-04-08 19:02:48 +00:00			`if feed.fullpage:`
			`print ' fetching full page <%s>' % entry.link`
			`self.fullpage = fetch_full_page(entry.link)`
			`if feed.readability:`
			`print ' fetching readability <%s>' % entry.link`
			`self.readability = fetch_readability(entry.link)`
			`if feed.html2textcontent:`
			`print ' converting summary'`
			`h2t = html2text.HTML2Text()`
			`h2t.body_width = 0`
			`h2t.inline_links = False`
new option to ignore links on feeds 2013-04-09 10:23:05 +00:00			`if feed.html2textignorelinks:`
			`h2t.ignore_links = True`
refactored processing of entries, enabled processing of enclosures 2013-04-08 19:02:48 +00:00			`if feed.html2textignoreimages:`
			`h2t.ignore_images = True`
fixed problem with empty entries 2013-04-08 20:47:22 +00:00			`if feed.contentcolumn == 'summary' and self.summary:`
refactored processing of entries, enabled processing of enclosures 2013-04-08 19:02:48 +00:00			`self.summary = h2t.handle(self.summary)`
fixed problem with empty entries 2013-04-08 20:47:22 +00:00			`elif feed.contentcolumn == 'content' and self.content:`
refactored processing of entries, enabled processing of enclosures 2013-04-08 19:02:48 +00:00			`self.content = h2t.handle(self.content)`
fixed problem with empty entries 2013-04-08 20:47:22 +00:00			`elif feed.contentcolumn == 'fullpage' and self.fullpage:`
refactored processing of entries, enabled processing of enclosures 2013-04-08 19:02:48 +00:00			`self.fullpage = h2t.handle(self.fullpage)`
fixed problem with empty entries 2013-04-08 20:47:22 +00:00			`elif feed.contentcolumn == 'readability' and self.readability:`
refactored processing of entries, enabled processing of enclosures 2013-04-08 19:02:48 +00:00			`self.readability = h2t.handle(self.readability)`
			`hp = HTMLParser.HTMLParser()`
			`if self.summary:`
			`self.summary = hp.unescape(self.summary)`
			`if self.content:`
			`self.content = hp.unescape(self.content)`
			`if self.fullpage:`
			`self.fullpage = hp.unescape(self.fullpage)`
			`if self.readability:`
			`self.readability = hp.unescape(self.readability)`
			`self.firstfetched = datetime.now()`
refactored so the data model lives in its own module 2013-03-26 19:20:51 +00:00			`self.lastfetched = datetime.now()`
minor change in db-structure, made mailtext column selectable 2013-04-02 22:06:06 +00:00
refactored processing of entries, enabled processing of enclosures 2013-04-08 19:02:48 +00:00			`def __unicode__(self):`
			`return u'%d -> %s' % (self.id, self.title)`

			`def __str__(self):`
			`return unicode(self).encode('utf-8')`

			`def __repr__(self):`
			`return "<Entry('%d','%s')>" % (self.id, self.title)`

no enclosures in text for daily digests 2013-04-17 18:59:50 +00:00			`def get_text(self, enclosures=True):`
fixed problem with empty entries 2013-04-08 20:47:22 +00:00			`text = ''`
			`if self.feed.contentcolumn == 'summary' and self.summary:`
minor change in db-structure, made mailtext column selectable 2013-04-02 22:06:06 +00:00			`text = self.summary`
fixed problem with empty entries 2013-04-08 20:47:22 +00:00			`elif self.feed.contentcolumn == 'content' and self.content:`
minor change in db-structure, made mailtext column selectable 2013-04-02 22:06:06 +00:00			`text = self.content`
fixed problem with empty entries 2013-04-08 20:47:22 +00:00			`elif self.feed.contentcolumn == 'fullpage' and self.fullpage:`
minor change in db-structure, made mailtext column selectable 2013-04-02 22:06:06 +00:00			`text = self.fullpage`
fixed problem with empty entries 2013-04-08 20:47:22 +00:00			`elif self.feed.contentcolumn == 'readability' and self.readability:`
minor change in db-structure, made mailtext column selectable 2013-04-02 22:06:06 +00:00			`text = self.readability`
no enclosures in text for daily digests 2013-04-17 18:59:50 +00:00			`if self.enclosures and enclosures:`
refactored processing of entries, enabled processing of enclosures 2013-04-08 19:02:48 +00:00			`text += '\n\nEnclosures:\n%s' % self.enclosures`
minor change in db-structure, made mailtext column selectable 2013-04-02 22:06:06 +00:00			`return text`

cleanup 2013-04-09 21:39:02 +00:00			`# -- coding: utf-8 --`