jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/830572 )
Change subject: [IMPR] Some improvements ......................................................................
[IMPR] Some improvements
- allow defusedxml to be used with xmlreader.py - simplify assignments in dataextend.py - always return a list within Analyzer.findclaims() - validate server certificate's hostname in dataextend.py - update ignore-names for pep8-naming to its default
Change-Id: I6d9c469450d64ba5e32ca696d1f3790d417adbdb --- M pywikibot/xmlreader.py M scripts/dataextend.py M tox.ini 3 files changed, 27 insertions(+), 19 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/xmlreader.py b/pywikibot/xmlreader.py index 8277b97..69712c1 100644 --- a/pywikibot/xmlreader.py +++ b/pywikibot/xmlreader.py @@ -1,11 +1,14 @@ -""" -XML reading module. +"""XML reading module.
Each XmlEntry object represents a page, as read from an XML source
The XmlDump class reads a pages_current XML dump (like the ones offered on https://dumps.wikimedia.org/backup-index.html) and offers a generator over XmlEntry objects which can be used by other bots. + +.. versionchanged:: 7.7 + *defusedxml* is used in favour of *xml.etree* if present to prevent + vulnerable XML attacks. *defusedxml* 0.7.1 or higher is recommended. """ # # (C) Pywikibot team, 2005-2022 @@ -14,7 +17,11 @@ # import re from typing import Optional -from xml.etree.ElementTree import iterparse, ParseError + +try: + from defusedxml.ElementTree import iterparse, ParseError +except ImportError: + from xml.etree.ElementTree import iterparse, ParseError
from pywikibot.backports import Callable, Type from pywikibot.tools import open_archive diff --git a/scripts/dataextend.py b/scripts/dataextend.py index c234485..ac6b861 100644 --- a/scripts/dataextend.py +++ b/scripts/dataextend.py @@ -67,13 +67,13 @@ from contextlib import suppress from html import unescape from textwrap import shorten -from typing import Tuple +from typing import Optional from urllib.error import HTTPError, URLError from urllib.parse import quote, unquote from urllib.request import urlopen
import pywikibot -from pywikibot.backports import List +from pywikibot.backports import List, Tuple from pywikibot.bot import input_yn, SingleSiteBot, suggest_help from pywikibot.data import sparql from pywikibot.exceptions import ( @@ -1242,7 +1242,7 @@ term = term.split('(')[0] if ',' in term: if term.split(',')[1].strip().lower() in ['jr', 'sr']: - term = term + '.' + term += '.' else: if term.strip()[-1] != term.strip()[-1].lower(): term = term.strip() + '.' @@ -1283,14 +1283,13 @@ answer = None return answer
- def findclaims(self): - if not self.id: - return + def findclaims(self) -> List[Tuple[str, str, Optional['Analyzer']]]: + if not self.id or not (self.url or self.sparqlquery): + return [] + self.html = '' - if not self.url and not self.sparqlquery: - return newclaims = [] - pywikibot.output() + pywikibot.info() pagerequest = None if not self.skipfirst: try: @@ -1316,7 +1315,6 @@ pywikibot.output('Unable to receive page {} - not unicode?' .format(self.url)) pagerequest = None - self.html = ''
if pagerequest: pagebytes = pagerequest.read() @@ -1330,6 +1328,8 @@ pywikibot.output('Getting {}'.format(extraurl)) if 'https' in self.url: context = ssl._create_unverified_context() + # validating the server certificate's hostname is recommended + context.check_hostname = True pagerequest = urlopen(extraurl, context=context) else: pagerequest = urlopen(extraurl) @@ -1338,14 +1338,15 @@ else: pagebytes = pagerequest.read() try: - self.html = self.html + '\n' + pagebytes.decode('utf-8') + self.html += '\n' + pagebytes.decode('utf-8') except UnicodeDecodeError: - self.html = self.html + '\n' + str(pagebytes) + self.html += '\n' + str(pagebytes)
if self.sparqlquery: self.html = str(sparql.SparqlQuery().select(self.sparqlquery)) + if not self.html: - return + return []
if self.escapeunicode: self.html = self.html.encode().decode('unicode-escape') @@ -5722,11 +5723,11 @@ section = self.findbyre( r'(?s)<div class="detail_label">Artistic Role(s):</div>\s*<div class="detail_text">(.*?)<', html) if section: - result = result + self.findallbyre(r'([^,]*)', section, 'occupation') + result += self.findallbyre(r'([^,]*)', section, 'occupation') section = self.findbyre( r'(?s)<div class="detail_label">Other Occupation(s):</div>\s*<div class="detail_text">(.*?)<', html) if section: - result = result + self.findallbyre(r'([^,]*)', section, 'occupation') + result += self.findallbyre(r'([^,]*)', section, 'occupation') return result
def findresidences(self, html: str): diff --git a/tox.ini b/tox.ini index 3b35408..654e9aa 100644 --- a/tox.ini +++ b/tox.ini @@ -219,7 +219,7 @@ # pep8-naming classmethod-decorators = classmethod,classproperty # required with pep8-naming < 0.13 -ignore-names = setUp,tearDown,setUpClass,tearDownClass,setUpModule,tearDownModule,maxDiff +ignore-names = setUp,tearDown,setUpClass,tearDownClass,setUpModule,tearDownModule,asyncSetUp,asyncTearDown,setUpTestData,failureException,longMessage,maxDiff
[isort] atomic = true
pywikibot-commits@lists.wikimedia.org