jenkins-bot submitted this change.

View Change

Approvals: Xqt: Looks good to me, approved; jenkins-bot: Verified
[IMPR] Some improvements

- allow defusedxml to be used with xmlreader.py
- simplify assignments in dataextend.py
- always return a list within Analyzer.findclaims()
- validate server certificate's hostname in dataextend.py
- update ignore-names for pep8-naming to its default

Change-Id: I6d9c469450d64ba5e32ca696d1f3790d417adbdb
---
M pywikibot/xmlreader.py
M scripts/dataextend.py
M tox.ini
3 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/pywikibot/xmlreader.py b/pywikibot/xmlreader.py
index 8277b97..69712c1 100644
--- a/pywikibot/xmlreader.py
+++ b/pywikibot/xmlreader.py
@@ -1,11 +1,14 @@
-"""
-XML reading module.
+"""XML reading module.

Each XmlEntry object represents a page, as read from an XML source

The XmlDump class reads a pages_current XML dump (like the ones offered on
https://dumps.wikimedia.org/backup-index.html) and offers a generator over
XmlEntry objects which can be used by other bots.
+
+.. versionchanged:: 7.7
+ *defusedxml* is used in favour of *xml.etree* if present to prevent
+ vulnerable XML attacks. *defusedxml* 0.7.1 or higher is recommended.
"""
#
# (C) Pywikibot team, 2005-2022
@@ -14,7 +17,11 @@
#
import re
from typing import Optional
-from xml.etree.ElementTree import iterparse, ParseError
+
+try:
+ from defusedxml.ElementTree import iterparse, ParseError
+except ImportError:
+ from xml.etree.ElementTree import iterparse, ParseError

from pywikibot.backports import Callable, Type
from pywikibot.tools import open_archive
diff --git a/scripts/dataextend.py b/scripts/dataextend.py
index c234485..ac6b861 100644
--- a/scripts/dataextend.py
+++ b/scripts/dataextend.py
@@ -67,13 +67,13 @@
from contextlib import suppress
from html import unescape
from textwrap import shorten
-from typing import Tuple
+from typing import Optional
from urllib.error import HTTPError, URLError
from urllib.parse import quote, unquote
from urllib.request import urlopen

import pywikibot
-from pywikibot.backports import List
+from pywikibot.backports import List, Tuple
from pywikibot.bot import input_yn, SingleSiteBot, suggest_help
from pywikibot.data import sparql
from pywikibot.exceptions import (
@@ -1242,7 +1242,7 @@
term = term.split('(')[0]
if ',' in term:
if term.split(',')[1].strip().lower() in ['jr', 'sr']:
- term = term + '.'
+ term += '.'
else:
if term.strip()[-1] != term.strip()[-1].lower():
term = term.strip() + '.'
@@ -1283,14 +1283,13 @@
answer = None
return answer

- def findclaims(self):
- if not self.id:
- return
+ def findclaims(self) -> List[Tuple[str, str, Optional['Analyzer']]]:
+ if not self.id or not (self.url or self.sparqlquery):
+ return []
+
self.html = ''
- if not self.url and not self.sparqlquery:
- return
newclaims = []
- pywikibot.output()
+ pywikibot.info()
pagerequest = None
if not self.skipfirst:
try:
@@ -1316,7 +1315,6 @@
pywikibot.output('Unable to receive page {} - not unicode?'
.format(self.url))
pagerequest = None
- self.html = ''

if pagerequest:
pagebytes = pagerequest.read()
@@ -1330,6 +1328,8 @@
pywikibot.output('Getting {}'.format(extraurl))
if 'https' in self.url:
context = ssl._create_unverified_context()
+ # validating the server certificate's hostname is recommended
+ context.check_hostname = True
pagerequest = urlopen(extraurl, context=context)
else:
pagerequest = urlopen(extraurl)
@@ -1338,14 +1338,15 @@
else:
pagebytes = pagerequest.read()
try:
- self.html = self.html + '\n' + pagebytes.decode('utf-8')
+ self.html += '\n' + pagebytes.decode('utf-8')
except UnicodeDecodeError:
- self.html = self.html + '\n' + str(pagebytes)
+ self.html += '\n' + str(pagebytes)

if self.sparqlquery:
self.html = str(sparql.SparqlQuery().select(self.sparqlquery))
+
if not self.html:
- return
+ return []

if self.escapeunicode:
self.html = self.html.encode().decode('unicode-escape')
@@ -5722,11 +5723,11 @@
section = self.findbyre(
r'(?s)<div class="detail_label">Artistic Role\(s\):</div>\s*<div class="detail_text">(.*?)<', html)
if section:
- result = result + self.findallbyre(r'([^,]*)', section, 'occupation')
+ result += self.findallbyre(r'([^,]*)', section, 'occupation')
section = self.findbyre(
r'(?s)<div class="detail_label">Other Occupation\(s\):</div>\s*<div class="detail_text">(.*?)<', html)
if section:
- result = result + self.findallbyre(r'([^,]*)', section, 'occupation')
+ result += self.findallbyre(r'([^,]*)', section, 'occupation')
return result

def findresidences(self, html: str):
diff --git a/tox.ini b/tox.ini
index 3b35408..654e9aa 100644
--- a/tox.ini
+++ b/tox.ini
@@ -219,7 +219,7 @@
# pep8-naming
classmethod-decorators = classmethod,classproperty
# required with pep8-naming < 0.13
-ignore-names = setUp,tearDown,setUpClass,tearDownClass,setUpModule,tearDownModule,maxDiff
+ignore-names = setUp,tearDown,setUpClass,tearDownClass,setUpModule,tearDownModule,asyncSetUp,asyncTearDown,setUpTestData,failureException,longMessage,maxDiff

[isort]
atomic = true

To view, visit change 830572. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I6d9c469450d64ba5e32ca696d1f3790d417adbdb
Gerrit-Change-Number: 830572
Gerrit-PatchSet: 6
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki@aol.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged