diff --git a/goose/parsers.py b/goose/parsers.py index a43e9b47..f5374976 100644 --- a/goose/parsers.py +++ b/goose/parsers.py @@ -26,6 +26,7 @@ from copy import deepcopy from goose.text import innerTrim from goose.text import encodeValue +import re class Parser(object): @@ -51,7 +52,12 @@ def css_select(self, node, selector): @classmethod def fromstring(self, html): html = encodeValue(html) + + # remove the <?xml ... ?> declaration because it breaks the lxml html parser + html = re.sub(r'<\?xml version\=[\"\'][0-9]\.[0-9][\"\'] encoding\=(.*?)\?>', '', html) + self.doc = lxml.html.fromstring(html) + return self.doc @classmethod