jenkins-bot has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/362967 )
Change subject: proofreadpage.py: Fall back to "html.parser" if "lxml"
is not available
......................................................................
proofreadpage.py: Fall back to "html.parser" if "lxml" is not
available
Bug: T169515
Change-Id: I8c6a3741b54b9a09dcdd263cbf59fad91fa9923a
---
M pywikibot/proofreadpage.py
1 file changed, 13 insertions(+), 6 deletions(-)
Approvals:
jenkins-bot: Verified
Xqt: Looks good to me, approved
diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py
index 13dbce3..a49e657 100644
--- a/pywikibot/proofreadpage.py
+++ b/pywikibot/proofreadpage.py
@@ -26,16 +26,23 @@
__version__ = '$Id$'
+from functools import partial
import json
import re
try:
- from bs4 import BeautifulSoup
+ from bs4 import BeautifulSoup, FeatureNotFound
except ImportError as e:
BeautifulSoup = e
+else:
+ try:
+ BeautifulSoup('', 'lxml')
+ except FeatureNotFound:
+ Soup = partial(BeautifulSoup, features='html.parser')
+ else:
+ Soup = partial(BeautifulSoup, features='lxml')
import pywikibot
-
from pywikibot.comms import http
from pywikibot.data.api import Request
@@ -522,7 +529,7 @@
pywikibot.error('Error fetching HTML for %s.' % self)
raise
- soup = BeautifulSoup(response.content, 'lxml')
+ soup = Soup(response.content)
try:
# None if nothing is found by .find()
@@ -577,7 +584,7 @@
"""Do hocr using
//tools.wmflabs.org/phetools/hocr_cgi.py?cmd=hocr."""
def parse_hocr_text(txt):
"""Parse hocr text."""
- soup = BeautifulSoup(txt, 'lxml')
+ soup = Soup(txt)
res = []
for ocr_page in soup.find_all(class_='ocr_page'):
@@ -743,7 +750,7 @@
del self._parsed_text
self._parsed_text = self._get_parsed_page()
- self._soup = BeautifulSoup(self._parsed_text, 'html.parser')
+ self._soup = Soup(self._parsed_text)
# Do not search for "new" here, to avoid to skip purging if links
# to non-existing pages are present.
attrs = {'class': re.compile('prp-pagequality')}
@@ -765,7 +772,7 @@
self.purge()
del self._parsed_text
self._parsed_text = self._get_parsed_page()
- self._soup = BeautifulSoup(self._parsed_text, 'html.parser')
+ self._soup = Soup(self._parsed_text)
if not self._soup.find_all('a', attrs=attrs):
raise ValueError(
'Missing class="qualityN prp-pagequality-N" or '
--
To view, visit
https://gerrit.wikimedia.org/r/362967
To unsubscribe, visit
https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I8c6a3741b54b9a09dcdd263cbf59fad91fa9923a
Gerrit-PatchSet: 6
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Dalba <dalba.wiki(a)gmail.com>
Gerrit-Reviewer: Dalba <dalba.wiki(a)gmail.com>
Gerrit-Reviewer: Magul <tomasz.magulski(a)gmail.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot <>