implemented newer version of readability-module

This commit is contained in:
2013-04-15 23:33:24 +02:00
parent 8eb6ed2a6d
commit b6d1e705f0
8 changed files with 797 additions and 234 deletions

View File

@ -2,15 +2,22 @@ from sqlalchemy import Column, Integer, ForeignKey, String, Text, DateTime
from datetime import datetime
from time import mktime
import urllib2
#import hn
from readability.readability import Document
import html2text
import HTMLParser
from models import Base
def fetch_readability(link):
    """Download *link* and return its main content as plain text.

    The page is fetched with ``urllib2``, reduced to its main content
    markup via readability's ``Document(...).summary()``, and that markup
    is then converted to text by ``html2text``.

    :param link: URL of the page to fetch.
    :returns: the text produced by ``html2text`` for the extracted content.

    Errors raised by ``urllib2.urlopen`` (e.g. ``urllib2.URLError``) are
    not caught here and propagate to the caller.
    """
    # Configure the HTML -> text converter: no hard line wrapping, and
    # neither links nor images are carried over into the output.
    h2t = html2text.HTML2Text()
    h2t.body_width = 0
    h2t.inline_links = False
    h2t.ignore_links = True
    h2t.ignore_images = True

    # urllib2 responses are not context managers in Python 2, so close the
    # connection explicitly even if reading fails.
    response = urllib2.urlopen(link)
    try:
        raw_html = response.read()
    finally:
        response.close()

    summary_html = Document(raw_html).summary()
    return h2t.handle(summary_html)
def fetch_full_page(link):