implemented readability-support

parent 3141f82df3
commit e4fb328d48
@@ -8,6 +8,7 @@ import feedparser
 import re
 import sys
 import urllib
+import hn

 Base = declarative_base()

@@ -91,6 +92,7 @@ class Entry(Base):
     enclosures = Column(Text)

     fullpage = Column(Text)
+    readability = Column(Text)
     lastfetched = Column(DateTime)
     sent = Column(DateTime)

@@ -125,6 +127,11 @@ session = Session()
 #session.add(Feed('http://www.heise.de/newsticker/heise-atom.xml', 1, 0, 0, 1))
 #session.add(Feed('http://blog.schatenseite.de/feed/', 1, 0, 0, 1))

+def fetch_readability(link):
+    text = hn.upgradeLink(link)
+    text = text.decode('utf8')
+    return text
+
 def fetch_full_page(link):
     opener = urllib.FancyURLopener({})
     response = opener.open(link)
@@ -140,6 +147,8 @@ def process_feed_entry(feed, entry):
     thisentry = Entry(entry)
     if feed.fullpage == 1:
         thisentry.fullpage = fetch_full_page(entry.link)
+    if feed.readability == 1:
+        thisentry.readability = fetch_readability(entry.link)
     feed.entry.append(thisentry)
     return "+"

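(A quick usage sketch, not part of the commit: fetch_readability() simply delegates to hn.upgradeLink() from the new module below and decodes the result, so it can be exercised on its own under Python 2. The URL is a placeholder.)

import os
import hn

# upgradeLink() caches extracted pages under "upgraded/"; create the directory
# first, otherwise the cache write fails and is silently swallowed
# (IOError -> pass) and every call re-fetches the page.
if not os.path.isdir("upgraded"):
    os.mkdir("upgraded")

text = hn.upgradeLink(u"http://example.com/some-article")
text = text.decode("utf8")  # the same decode step fetch_readability() performs
print text[:200]
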
hn.py (new file, 231 lines)
@@ -0,0 +1,231 @@
+"""
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+"""
+
+
+from xml.sax.saxutils import escape
+
+import urllib, re, os, urlparse
+import HTMLParser, feedparser
+from BeautifulSoup import BeautifulSoup
+from pprint import pprint
+
+import codecs
+import sys
+streamWriter = codecs.lookup('utf-8')[-1]
+sys.stdout = streamWriter(sys.stdout)
+
+
+HN_RSS_FEED = "http://news.ycombinator.com/rss"
+
+NEGATIVE = re.compile("comment|meta|footer|footnote|foot")
+POSITIVE = re.compile("post|hentry|entry|content|text|body|article")
+PUNCTUATION = re.compile("""[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]""")
+
+
+def grabContent(link, html):
+
+    replaceBrs = re.compile("<br */? *>[ \r\n]*<br */? *>")
+    html = re.sub(replaceBrs, "</p><p>", html)
+
+    try:
+        soup = BeautifulSoup(html)
+    except HTMLParser.HTMLParseError:
+        return ""
+
+    # REMOVE SCRIPTS
+    for s in soup.findAll("script"):
+        s.extract()
+
+    allParagraphs = soup.findAll("p")
+    topParent = None
+
+    parents = []
+    for paragraph in allParagraphs:
+
+        parent = paragraph.parent
+
+        if (parent not in parents):
+            parents.append(parent)
+            parent.score = 0
+
+            if (parent.has_key("class")):
+                if (NEGATIVE.match(parent["class"])):
+                    parent.score -= 50
+                if (POSITIVE.match(parent["class"])):
+                    parent.score += 25
+
+            if (parent.has_key("id")):
+                if (NEGATIVE.match(parent["id"])):
+                    parent.score -= 50
+                if (POSITIVE.match(parent["id"])):
+                    parent.score += 25
+
+        if (parent.score == None):
+            parent.score = 0
+
+        innerText = paragraph.renderContents() #"".join(paragraph.findAll(text=True))
+        if (len(innerText) > 10):
+            parent.score += 1
+
+        parent.score += innerText.count(",")
+
+    for parent in parents:
+        if ((not topParent) or (parent.score > topParent.score)):
+            topParent = parent
+
+    if (not topParent):
+        return ""
+
+    # REMOVE LINK'D STYLES
+    styleLinks = soup.findAll("link", attrs={"type" : "text/css"})
+    for s in styleLinks:
+        s.extract()
+
+    # REMOVE ON PAGE STYLES
+    for s in soup.findAll("style"):
+        s.extract()
+
+    # CLEAN STYLES FROM ELEMENTS IN TOP PARENT
+    for ele in topParent.findAll(True):
+        del(ele['style'])
+        del(ele['class'])
+
+    killDivs(topParent)
+    clean(topParent, "form")
+    clean(topParent, "object")
+    clean(topParent, "iframe")
+
+    fixLinks(topParent, link)
+
+    return topParent.renderContents()
+
+
+def fixLinks(parent, link):
+    tags = parent.findAll(True)
+
+    for t in tags:
+        if (t.has_key("href")):
+            t["href"] = urlparse.urljoin(link, t["href"])
+        if (t.has_key("src")):
+            t["src"] = urlparse.urljoin(link, t["src"])
+
+
+def clean(top, tag, minWords=10000):
+    tags = top.findAll(tag)
+
+    for t in tags:
+        if (t.renderContents().count(" ") < minWords):
+            t.extract()
+
+
+def killDivs(parent):
+
+    divs = parent.findAll("div")
+    for d in divs:
+        p = len(d.findAll("p"))
+        img = len(d.findAll("img"))
+        li = len(d.findAll("li"))
+        a = len(d.findAll("a"))
+        embed = len(d.findAll("embed"))
+        pre = len(d.findAll("pre"))
+        code = len(d.findAll("code"))
+
+        if (d.renderContents().count(",") < 10):
+            if ((pre == 0) and (code == 0)):
+                if ((img > p) or (li > p) or (a > p) or (p == 0) or (embed > 0)):
+                    d.extract()
+
+
+def upgradeLink(link):
+
+    link = link.encode('utf-8')
+
+    if (not (link.startswith("http://news.ycombinator.com") or link.endswith(".pdf"))):
+        linkFile = "upgraded/" + re.sub(PUNCTUATION, "_", link)
+        if (os.path.exists(linkFile)):
+            return open(linkFile).read()
+        else:
+            content = ""
+            try:
+                html = urllib.urlopen(link).read()
+                content = grabContent(link, html)
+                filp = open(linkFile, "w")
+                filp.write(content)
+                filp.close()
+            except IOError:
+                pass
+            return content
+    else:
+        return ""
+
+
+
+def upgradeFeed(feedUrl):
+
+    feedData = urllib.urlopen(feedUrl).read()
+
+    upgradedLinks = []
+    parsedFeed = feedparser.parse(feedData)
+
+    for entry in parsedFeed.entries:
+        upgradedLinks.append((entry, upgradeLink(entry.link)))
+
+    rss = """<rss version="2.0">
+<channel>
+<title>Hacker News</title>
+<link>http://news.ycombinator.com/</link>
+<description>Links for the intellectually curious, ranked by readers.</description>
+
+"""
+
+    for entry, content in upgradedLinks:
+        rss += u"""
+<item>
+<title>%s</title>
+<link>%s</link>
+<comments>%s</comments>
+<description>
+<![CDATA[<a href="%s">Comments</a><br/>%s<br/><a href="%s">Comments</a>]]>
+</description>
+</item>
+""" % (entry.title, escape(entry.link), escape(entry.comments), entry.comments, content.decode('utf-8'), entry.comments)
+
+    rss += """
+</channel>
+</rss>"""
+
+
+    return rss
+
+if __name__ == "__main__":
+    print upgradeFeed(HN_RSS_FEED)
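(Also worth noting, as a sketch rather than part of the diff: hn.py is self-contained and can build a "readable" HN feed on its own, assuming Python 2 with BeautifulSoup 3 and feedparser installed and an "upgraded/" cache directory present. The output filename is arbitrary.)

import hn

# upgradeFeed() fetches the feed, runs every entry link through upgradeLink(),
# and returns an RSS 2.0 document with the extracted article bodies inlined.
rss = hn.upgradeFeed(hn.HN_RSS_FEED)
open("hn-readable.rss", "w").write(rss.encode("utf-8"))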