implemented readability-support
parent 3141f82df3
commit e4fb328d48
@@ -8,6 +8,7 @@ import feedparser
 import re
 import sys
 import urllib
+import hn
 
 Base = declarative_base()
 
@@ -91,6 +92,7 @@ class Entry(Base):
     enclosures = Column(Text)
 
     fullpage = Column(Text)
+    readability = Column(Text)
     lastfetched = Column(DateTime)
     sent = Column(DateTime)
 
@@ -125,6 +127,11 @@ session = Session()
 #session.add(Feed('http://www.heise.de/newsticker/heise-atom.xml', 1, 0, 0, 1))
 #session.add(Feed('http://blog.schatenseite.de/feed/', 1, 0, 0, 1))
 
+def fetch_readability(link):
+    text = hn.upgradeLink(link)
+    text = text.decode('utf8')
+    return text
+
 def fetch_full_page(link):
     opener = urllib.FancyURLopener({})
     response = opener.open(link)
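fetch_readability delegates to the new hn module below: upgradeLink returns the extracted article as UTF-8 bytes (BeautifulSoup's renderContents output), so the wrapper decodes it before it lands in the Text column. A minimal sketch with a made-up URL:

    text = fetch_readability('http://example.com/article')  # unicode, ready for Entry.readability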
@@ -140,6 +147,8 @@ def process_feed_entry(feed, entry):
     thisentry = Entry(entry)
     if feed.fullpage == 1:
         thisentry.fullpage = fetch_full_page(entry.link)
+    if feed.readability == 1:
+        thisentry.readability = fetch_readability(entry.link)
     feed.entry.append(thisentry)
     return "+"
 
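Together the hunks wire the new column through the fetch path: a per-feed readability flag now triggers extraction alongside the existing fullpage fetch. A minimal sketch of switching one feed over, assuming the surrounding rsstool session and that Feed rows carry url and readability attributes (only the flag check is visible in this diff):

    feed = session.query(Feed).first()
    feed.readability = 1
    for entry in feedparser.parse(feed.url).entries:
        process_feed_entry(feed, entry)
    session.commit()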
hn.py (new file, 231 lines)
@@ -0,0 +1,231 @@
"""
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""

from xml.sax.saxutils import escape

import urllib, re, os, urlparse
import HTMLParser, feedparser
from BeautifulSoup import BeautifulSoup
from pprint import pprint

import codecs
import sys
streamWriter = codecs.lookup('utf-8')[-1]
sys.stdout = streamWriter(sys.stdout)

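Wrapping sys.stdout in a UTF-8 StreamWriter lets the Python 2 print statement at the bottom of the file emit non-ASCII article text even when output is piped or redirected, where stdout would otherwise be ASCII and raise UnicodeEncodeError.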
HN_RSS_FEED = "http://news.ycombinator.com/rss"

NEGATIVE = re.compile("comment|meta|footer|footnote|foot")
POSITIVE = re.compile("post|hentry|entry|content|text|body|article")
PUNCTUATION = re.compile("""[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]""")

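NEGATIVE and POSITIVE are matched against class and id attributes when scoring candidate containers in grabContent below; since they are used with match(), only values beginning with one of the keywords count. PUNCTUATION turns a URL into a filesystem-safe cache filename in upgradeLink, e.g.:

    >>> re.sub(PUNCTUATION, "_", "http://example.com/a?b=1")
    'http___example_com_a_b_1'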
def grabContent(link, html):

    replaceBrs = re.compile("<br */? *>[ \r\n]*<br */? *>")
    html = re.sub(replaceBrs, "</p><p>", html)

    try:
        soup = BeautifulSoup(html)
    except HTMLParser.HTMLParseError:
        return ""

    # REMOVE SCRIPTS
    for s in soup.findAll("script"):
        s.extract()

    allParagraphs = soup.findAll("p")
    topParent = None

    parents = []
    for paragraph in allParagraphs:

        parent = paragraph.parent

        if (parent not in parents):
            parents.append(parent)
            parent.score = 0

            if (parent.has_key("class")):
                if (NEGATIVE.match(parent["class"])):
                    parent.score -= 50
                if (POSITIVE.match(parent["class"])):
                    parent.score += 25

            if (parent.has_key("id")):
                if (NEGATIVE.match(parent["id"])):
                    parent.score -= 50
                if (POSITIVE.match(parent["id"])):
                    parent.score += 25

        if (parent.score == None):
            parent.score = 0

        innerText = paragraph.renderContents() #"".join(paragraph.findAll(text=True))
        if (len(innerText) > 10):
            parent.score += 1

        parent.score += innerText.count(",")

    for parent in parents:
        if ((not topParent) or (parent.score > topParent.score)):
            topParent = parent

    if (not topParent):
        return ""

    # REMOVE LINK'D STYLES
    styleLinks = soup.findAll("link", attrs={"type" : "text/css"})
    for s in styleLinks:
        s.extract()

    # REMOVE ON PAGE STYLES
    for s in soup.findAll("style"):
        s.extract()

    # CLEAN STYLES FROM ELEMENTS IN TOP PARENT
    for ele in topParent.findAll(True):
        del(ele['style'])
        del(ele['class'])

    killDivs(topParent)
    clean(topParent, "form")
    clean(topParent, "object")
    clean(topParent, "iframe")

    fixLinks(topParent, link)

    return topParent.renderContents()

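The heuristic: the parent element whose paragraphs are longest and most comma-laden wins, nudged up or down by the class/id regexes, and everything else (scripts, styles, forms, link-heavy divs) is stripped from the winner. A quick sketch against hypothetical markup (Python 2, BeautifulSoup 3):

    page = ("<div class='post'><p>A long first paragraph about the topic.</p>"
            "<p>More text, with commas, boosting, the score.</p></div>"
            "<div class='footer'><p>(c) example</p></div>")
    print grabContent("http://example.com/", page)  # keeps only the 'post' div's paragraphs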
def fixLinks(parent, link):
    tags = parent.findAll(True)

    for t in tags:
        if (t.has_key("href")):
            t["href"] = urlparse.urljoin(link, t["href"])
        if (t.has_key("src")):
            t["src"] = urlparse.urljoin(link, t["src"])


def clean(top, tag, minWords=10000):
    tags = top.findAll(tag)

    for t in tags:
        if (t.renderContents().count(" ") < minWords):
            t.extract()


def killDivs(parent):

    divs = parent.findAll("div")
    for d in divs:
        p = len(d.findAll("p"))
        img = len(d.findAll("img"))
        li = len(d.findAll("li"))
        a = len(d.findAll("a"))
        embed = len(d.findAll("embed"))
        pre = len(d.findAll("pre"))
        code = len(d.findAll("code"))

        if (d.renderContents().count(",") < 10):
            if ((pre == 0) and (code == 0)):
                if ((img > p) or (li > p) or (a > p) or (p == 0) or (embed > 0)):
                    d.extract()

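With the default minWords=10000, clean() keeps a tag only if its rendered contents contain at least 10,000 spaces, so the three calls in grabContent strip every form, object, and iframe in practice. killDivs is similarly blunt: a div with fewer than ten commas that is dominated by links, images, or list items (or contains an embed) is treated as navigation or an ad block and removed, while pre/code content exempts it. fixLinks rebases relative URLs against the article link:

    >>> urlparse.urljoin("http://example.com/post/1", "../img/x.png")
    'http://example.com/img/x.png'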
def upgradeLink(link):

    link = link.encode('utf-8')

    if (not (link.startswith("http://news.ycombinator.com") or link.endswith(".pdf"))):
        linkFile = "upgraded/" + re.sub(PUNCTUATION, "_", link)
        if (os.path.exists(linkFile)):
            return open(linkFile).read()
        else:
            content = ""
            try:
                html = urllib.urlopen(link).read()
                content = grabContent(link, html)
                filp = open(linkFile, "w")
                filp.write(content)
                filp.close()
            except IOError:
                pass
            return content
    else:
        return ""

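Extracted pages are cached on disk under upgraded/, keyed by the punctuation-mangled URL. Note that the directory is never created here: if it is missing, the open() for writing raises IOError, the handler swallows it, and every call re-fetches and re-parses the page (the extracted content is still returned). Priming the cache first, a sketch:

    if not os.path.isdir("upgraded"):
        os.mkdir("upgraded")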
def upgradeFeed(feedUrl):

    feedData = urllib.urlopen(feedUrl).read()

    upgradedLinks = []
    parsedFeed = feedparser.parse(feedData)

    for entry in parsedFeed.entries:
        upgradedLinks.append((entry, upgradeLink(entry.link)))

    rss = """<rss version="2.0">
    <channel>
        <title>Hacker News</title>
        <link>http://news.ycombinator.com/</link>
        <description>Links for the intellectually curious, ranked by readers.</description>
    """

    for entry, content in upgradedLinks:
        rss += u"""
        <item>
            <title>%s</title>
            <link>%s</link>
            <comments>%s</comments>
            <description>
                <![CDATA[<a href="%s">Comments</a><br/>%s<br/><a href="%s">Comments</a>]]>
            </description>
        </item>
        """ % (entry.title, escape(entry.link), escape(entry.comments), entry.comments, content.decode('utf-8'), entry.comments)

    rss += """
    </channel>
</rss>"""

    return rss


if __name__ == "__main__":
    print upgradeFeed(HN_RSS_FEED)
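
Run as a script, the module prints a rebuilt RSS 2.0 feed with each article's extracted body inlined between links to its comments thread. Note that entry.title is interpolated unescaped, unlike link and comments, so titles containing & or < yield invalid XML. The same entry point can be driven from code, as the rsstool changes above do via upgradeLink; a sketch with a made-up output name:

    import hn
    rss = hn.upgradeFeed(hn.HN_RSS_FEED)
    open("hn-full.rss", "w").write(rss.encode("utf-8"))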