jenkins-bot submitted this change.

View Change

Approvals: Xqt: Looks good to me, approved; jenkins-bot: Verified
[IMPR] Some improvements

- allow defusedxml to be used with xmlreader.py
- simplify assignments in dataextend.py
- always return a list within Analyzer.findclaims()
- validate server certificate's hostname in dataextend.py
- update ignore-names for pep8-naming to its default

Change-Id: I6d9c469450d64ba5e32ca696d1f3790d417adbdb
---
M pywikibot/xmlreader.py
M scripts/dataextend.py
M tox.ini
3 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/pywikibot/xmlreader.py b/pywikibot/xmlreader.py
index 8277b97..69712c1 100644
--- a/pywikibot/xmlreader.py
+++ b/pywikibot/xmlreader.py
@@ -1,11 +1,14 @@
-"""
-XML reading module.
+"""XML reading module.

Each XmlEntry object represents a page, as read from an XML source

The XmlDump class reads a pages_current XML dump (like the ones offered on
https://dumps.wikimedia.org/backup-index.html) and offers a generator over
XmlEntry objects which can be used by other bots.
+
+.. versionchanged:: 7.7
+ *defusedxml* is used in favour of *xml.etree* if present to prevent
+ vulnerable XML attacks. *defusedxml* 0.7.1 or higher is recommended.
"""
#
# (C) Pywikibot team, 2005-2022
@@ -14,7 +17,11 @@
#
import re
from typing import Optional
-from xml.etree.ElementTree import iterparse, ParseError
+
+try:
+ from defusedxml.ElementTree import iterparse, ParseError
+except ImportError:
+ from xml.etree.ElementTree import iterparse, ParseError

from pywikibot.backports import Callable, Type
from pywikibot.tools import open_archive
diff --git a/scripts/dataextend.py b/scripts/dataextend.py
index c234485..ac6b861 100644
--- a/scripts/dataextend.py
+++ b/scripts/dataextend.py
@@ -67,13 +67,13 @@
from contextlib import suppress
from html import unescape
from textwrap import shorten
-from typing import Tuple
+from typing import Optional
from urllib.error import HTTPError, URLError
from urllib.parse import quote, unquote
from urllib.request import urlopen

import pywikibot
-from pywikibot.backports import List
+from pywikibot.backports import List, Tuple
from pywikibot.bot import input_yn, SingleSiteBot, suggest_help
from pywikibot.data import sparql
from pywikibot.exceptions import (
@@ -1242,7 +1242,7 @@
term = term.split('(')[0]
if ',' in term:
if term.split(',')[1].strip().lower() in ['jr', 'sr']:
- term = term + '.'
+ term += '.'
else:
if term.strip()[-1] != term.strip()[-1].lower():
term = term.strip() + '.'
@@ -1283,14 +1283,13 @@
answer = None
return answer

- def findclaims(self):
- if not self.id:
- return
+ def findclaims(self) -> List[Tuple[str, str, Optional['Analyzer']]]:
+ if not self.id or not (self.url or self.sparqlquery):
+ return []
+
self.html = ''
- if not self.url and not self.sparqlquery:
- return
newclaims = []
- pywikibot.output()
+ pywikibot.info()
pagerequest = None
if not self.skipfirst:
try:
@@ -1316,7 +1315,6 @@
pywikibot.output('Unable to receive page {} - not unicode?'
.format(self.url))
pagerequest = None
- self.html = ''

if pagerequest:
pagebytes = pagerequest.read()
@@ -1330,6 +1328,8 @@
pywikibot.output('Getting {}'.format(extraurl))
if 'https' in self.url:
context = ssl._create_unverified_context()
+ # validating the server certificate's hostname is recommended
+ context.check_hostname = True
pagerequest = urlopen(extraurl, context=context)
else:
pagerequest = urlopen(extraurl)
@@ -1338,14 +1338,15 @@
else:
pagebytes = pagerequest.read()
try:
- self.html = self.html + '\n' + pagebytes.decode('utf-8')
+ self.html += '\n' + pagebytes.decode('utf-8')
except UnicodeDecodeError:
- self.html = self.html + '\n' + str(pagebytes)
+ self.html += '\n' + str(pagebytes)

if self.sparqlquery:
self.html = str(sparql.SparqlQuery().select(self.sparqlquery))
+
if not self.html:
- return
+ return []

if self.escapeunicode:
self.html = self.html.encode().decode('unicode-escape')
@@ -5722,11 +5723,11 @@
section = self.findbyre(
r'(?s)<div class="detail_label">Artistic Role\(s\):</div>\s*<div class="detail_text">(.*?)<', html)
if section:
- result = result + self.findallbyre(r'([^,]*)', section, 'occupation')
+ result += self.findallbyre(r'([^,]*)', section, 'occupation')
section = self.findbyre(
r'(?s)<div class="detail_label">Other Occupation\(s\):</div>\s*<div class="detail_text">(.*?)<', html)
if section:
- result = result + self.findallbyre(r'([^,]*)', section, 'occupation')
+ result += self.findallbyre(r'([^,]*)', section, 'occupation')
return result

def findresidences(self, html: str):
diff --git a/tox.ini b/tox.ini
index 3b35408..654e9aa 100644
--- a/tox.ini
+++ b/tox.ini
@@ -219,7 +219,7 @@
# pep8-naming
classmethod-decorators = classmethod,classproperty
# required with pep8-naming < 0.13
-ignore-names = setUp,tearDown,setUpClass,tearDownClass,setUpModule,tearDownModule,maxDiff
+ignore-names = setUp,tearDown,setUpClass,tearDownClass,setUpModule,tearDownModule,asyncSetUp,asyncTearDown,setUpTestData,failureException,longMessage,maxDiff

[isort]
atomic = true

To view, visit change 830572. To unsubscribe, or for help writing mail filters, visit settings.

Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I6d9c469450d64ba5e32ca696d1f3790d417adbdb
Gerrit-Change-Number: 830572
Gerrit-PatchSet: 6
Gerrit-Owner: Xqt <info@gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki@aol.com>
Gerrit-Reviewer: Xqt <info@gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged