diff --git a/wpull/document/html_test.py b/wpull/document/html_test.py index 4a297238..28a0169d 100644 --- a/wpull/document/html_test.py +++ b/wpull/document/html_test.py @@ -129,10 +129,7 @@ def test_html_encoding(self): elements = tuple(reader.iter_elements(data, encoding=name)) html_element = elements[0] - if isinstance(html_parser, LxmlHTMLParser): - self.assertEqual('html', html_element.tag) - else: - self.assertEqual('img', html_element.tag) + self.assertEqual('html', html_element.tag) def test_html_layout(self): html_parser = self.get_html_parser() @@ -160,13 +157,9 @@ def test_html_layout(self): self.assertEqual('body', elements[5].tag) self.assertEqual('img', elements[6].tag) - if isinstance(html_parser, LxmlHTMLParser): - self.assertEqual('img', elements[7].tag) - self.assertEqual('body', elements[8].tag) - self.assertEqual('html', elements[9].tag) - else: - self.assertEqual('body', elements[7].tag) - self.assertEqual('html', elements[8].tag) + self.assertEqual('img', elements[7].tag) + self.assertEqual('body', elements[8].tag) + self.assertEqual('html', elements[9].tag) def test_html_early_html(self): reader = HTMLReader(self.get_html_parser()) diff --git a/wpull/document/htmlparse/html5lib_.py b/wpull/document/htmlparse/html5lib_.py index 6f247437..8db791ef 100644 --- a/wpull/document/htmlparse/html5lib_.py +++ b/wpull/document/htmlparse/html5lib_.py @@ -1,6 +1,6 @@ '''Parsing using html5lib python.''' -import html5lib.constants -import html5lib.tokenizer +from html5lib.treewalkers.dom import TreeWalker +import html5lib import io import os.path @@ -8,14 +8,14 @@ from wpull.document.htmlparse.element import Comment, Doctype, Element -DOCTYPE = html5lib.constants.tokenTypes['Doctype'] -CHARACTERS = html5lib.constants.tokenTypes['Characters'] -SPACE_CHARACTERS = html5lib.constants.tokenTypes['SpaceCharacters'] -START_TAG = html5lib.constants.tokenTypes['StartTag'] -END_TAG = html5lib.constants.tokenTypes['EndTag'] -EMPTY_TAG = html5lib.constants.tokenTypes['EmptyTag'] -COMMENT = html5lib.constants.tokenTypes['Comment'] -PARSE_ERROR = html5lib.constants.tokenTypes['ParseError'] +class TreeWalkerAdapter(TreeWalker): + """ Simple adapter for TreeWalker. Splits up EmptyTag into start/end tag, + so the fragile logic of HTMLParser does not break """ + def emptyTag(self, namespace, name, attrs, hasChildren=False): + yield self.startTag(namespace, name, attrs) + if hasChildren: + yield self.error("Void element has children") + yield self.endTag(namespace, name) class HTMLParser(BaseParser): @@ -24,11 +24,10 @@ def parser_error(self): return ValueError def parse(self, file, encoding=None): - tokenizer = html5lib.tokenizer.HTMLTokenizer( - file, encoding=encoding, - useChardet=False if encoding else True, - parseMeta=False if encoding else True, - ) + tokenizer = TreeWalkerAdapter(html5lib.parse( + file, treebuilder='dom', + override_encoding=encoding, + )) tag = None attrib = None @@ -38,7 +37,7 @@ def parse(self, file, encoding=None): for token in tokenizer: token_type = token['type'] - if token_type == START_TAG: + if token_type == 'StartTag': if buffer: yield Element(tag, attrib, buffer.getvalue(), None, False) buffer = None @@ -48,19 +47,22 @@ def parse(self, file, encoding=None): tail_buffer = None tag = token['name'] - attrib = dict(token['data']) + # html5lib returns node names as ((namespace, name), value), + # but we expect just (name, value) pairs + attrib = dict(map(lambda x: (x[0][1], x[1]), token['data'].items())) buffer = io.StringIO() - if token['name'] == 'script': - tokenizer.state = tokenizer.scriptDataState + # XXX: ? + #if token['name'] == 'script': + # tokenizer.state = tokenizer.scriptDataState - elif token_type in (CHARACTERS, SPACE_CHARACTERS): + elif token_type in ('Characters', 'SpaceCharacters'): if buffer: buffer.write(token['data']) if tail_buffer: tail_buffer.write(token['data']) - elif token_type == END_TAG: + elif token_type == 'EndTag': if buffer: yield Element(tag, attrib, buffer.getvalue(), None, False) buffer = None @@ -72,12 +74,12 @@ def parse(self, file, encoding=None): tail_buffer = io.StringIO() tag = token['name'] - elif token_type == COMMENT: + elif token_type == 'Comment': yield Comment(token['data']) - elif token_type == DOCTYPE: + elif token_type == 'Doctype': yield Doctype('{} {} {}'.format( token['name'], token['publicId'], token['systemId'])) - elif token_type == PARSE_ERROR: + elif token_type == 'SerializeError': pass else: raise ValueError('Unhandled token {}'.format(token)) @@ -90,17 +92,17 @@ def parse(self, file, encoding=None): yield Element(tag, dict(), None, tail_buffer.getvalue(), True) tail_buffer = None - if __name__ == '__main__': path = os.path.join( os.path.dirname(__file__), '..', '..', 'testing', 'samples', 'xkcd_1.html' ) with open(path, 'rb') as in_file: - tokenizer = html5lib.tokenizer.HTMLTokenizer(in_file) + tokenizer = TreeWalkerAdapter(html5lib.parse(in_file, treebuilder='dom')) for token in tokenizer: print(token) html_parser = HTMLParser() for element in html_parser.parse(in_file): print(element) +