diff --git a/wpull/document/html_test.py b/wpull/document/html_test.py
index 4a297238..28a0169d 100644
--- a/wpull/document/html_test.py
+++ b/wpull/document/html_test.py
@@ -129,10 +129,7 @@ def test_html_encoding(self):
elements = tuple(reader.iter_elements(data, encoding=name))
html_element = elements[0]
- if isinstance(html_parser, LxmlHTMLParser):
- self.assertEqual('html', html_element.tag)
- else:
- self.assertEqual('img', html_element.tag)
+ self.assertEqual('html', html_element.tag)
def test_html_layout(self):
html_parser = self.get_html_parser()
@@ -160,13 +157,9 @@ def test_html_layout(self):
self.assertEqual('body', elements[5].tag)
self.assertEqual('img', elements[6].tag)
- if isinstance(html_parser, LxmlHTMLParser):
- self.assertEqual('img', elements[7].tag)
- self.assertEqual('body', elements[8].tag)
- self.assertEqual('html', elements[9].tag)
- else:
- self.assertEqual('body', elements[7].tag)
- self.assertEqual('html', elements[8].tag)
+ self.assertEqual('img', elements[7].tag)
+ self.assertEqual('body', elements[8].tag)
+ self.assertEqual('html', elements[9].tag)
def test_html_early_html(self):
reader = HTMLReader(self.get_html_parser())
diff --git a/wpull/document/htmlparse/html5lib_.py b/wpull/document/htmlparse/html5lib_.py
index 6f247437..8db791ef 100644
--- a/wpull/document/htmlparse/html5lib_.py
+++ b/wpull/document/htmlparse/html5lib_.py
@@ -1,6 +1,6 @@
'''Parsing using html5lib python.'''
-import html5lib.constants
-import html5lib.tokenizer
+from html5lib.treewalkers.dom import TreeWalker
+import html5lib
import io
import os.path
@@ -8,14 +8,14 @@
from wpull.document.htmlparse.element import Comment, Doctype, Element
-DOCTYPE = html5lib.constants.tokenTypes['Doctype']
-CHARACTERS = html5lib.constants.tokenTypes['Characters']
-SPACE_CHARACTERS = html5lib.constants.tokenTypes['SpaceCharacters']
-START_TAG = html5lib.constants.tokenTypes['StartTag']
-END_TAG = html5lib.constants.tokenTypes['EndTag']
-EMPTY_TAG = html5lib.constants.tokenTypes['EmptyTag']
-COMMENT = html5lib.constants.tokenTypes['Comment']
-PARSE_ERROR = html5lib.constants.tokenTypes['ParseError']
+class TreeWalkerAdapter(TreeWalker):
+    """Tree walker that splits every EmptyTag token into a separate
+    StartTag/EndTag pair, so HTMLParser's open/close pairing logic still works."""
+ def emptyTag(self, namespace, name, attrs, hasChildren=False):
+ yield self.startTag(namespace, name, attrs)
+ if hasChildren:
+ yield self.error("Void element has children")
+ yield self.endTag(namespace, name)
class HTMLParser(BaseParser):
@@ -24,11 +24,10 @@ def parser_error(self):
return ValueError
def parse(self, file, encoding=None):
- tokenizer = html5lib.tokenizer.HTMLTokenizer(
- file, encoding=encoding,
- useChardet=False if encoding else True,
- parseMeta=False if encoding else True,
- )
+ tokenizer = TreeWalkerAdapter(html5lib.parse(
+ file, treebuilder='dom',
+ override_encoding=encoding,
+ ))
tag = None
attrib = None
@@ -38,7 +37,7 @@ def parse(self, file, encoding=None):
for token in tokenizer:
token_type = token['type']
- if token_type == START_TAG:
+ if token_type == 'StartTag':
if buffer:
yield Element(tag, attrib, buffer.getvalue(), None, False)
buffer = None
@@ -48,19 +47,22 @@ def parse(self, file, encoding=None):
tail_buffer = None
tag = token['name']
- attrib = dict(token['data'])
+                # html5lib tree walkers key attributes as (namespace, name)
+                # tuples; flatten to the plain (name, value) pairs we expect
+ attrib = dict(map(lambda x: (x[0][1], x[1]), token['data'].items()))
buffer = io.StringIO()
- if token['name'] == 'script':
- tokenizer.state = tokenizer.scriptDataState
+                # NOTE: the tokenizer state hack below is obsolete — we now walk
+                # an already-parsed DOM, so there is no tokenizer state to switch.
+ #if token['name'] == 'script':
+ # tokenizer.state = tokenizer.scriptDataState
- elif token_type in (CHARACTERS, SPACE_CHARACTERS):
+ elif token_type in ('Characters', 'SpaceCharacters'):
if buffer:
buffer.write(token['data'])
if tail_buffer:
tail_buffer.write(token['data'])
- elif token_type == END_TAG:
+ elif token_type == 'EndTag':
if buffer:
yield Element(tag, attrib, buffer.getvalue(), None, False)
buffer = None
@@ -72,12 +74,12 @@ def parse(self, file, encoding=None):
tail_buffer = io.StringIO()
tag = token['name']
- elif token_type == COMMENT:
+ elif token_type == 'Comment':
yield Comment(token['data'])
- elif token_type == DOCTYPE:
+ elif token_type == 'Doctype':
yield Doctype('{} {} {}'.format(
token['name'], token['publicId'], token['systemId']))
- elif token_type == PARSE_ERROR:
+ elif token_type == 'SerializeError':
pass
else:
raise ValueError('Unhandled token {}'.format(token))
@@ -90,17 +92,17 @@ def parse(self, file, encoding=None):
yield Element(tag, dict(), None, tail_buffer.getvalue(), True)
tail_buffer = None
-
if __name__ == '__main__':
path = os.path.join(
os.path.dirname(__file__), '..', '..',
'testing', 'samples', 'xkcd_1.html'
)
with open(path, 'rb') as in_file:
- tokenizer = html5lib.tokenizer.HTMLTokenizer(in_file)
+ tokenizer = TreeWalkerAdapter(html5lib.parse(in_file, treebuilder='dom'))
for token in tokenizer:
print(token)
html_parser = HTMLParser()
for element in html_parser.parse(in_file):
print(element)
+