jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/830572 )
Change subject: [IMPR] Some improvements ......................................................................
[IMPR] Some improvements
- allow defusedxml to be used with xmlreader.py - simplify assignments in dataextend.py - always return a list within Analyzer.findclaims() - validate server certificate's hostname in dataextend.py - update ignore-names for pep8-naming to its default
Change-Id: I6d9c469450d64ba5e32ca696d1f3790d417adbdb --- M pywikibot/xmlreader.py M scripts/dataextend.py M tox.ini 3 files changed, 27 insertions(+), 19 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/xmlreader.py b/pywikibot/xmlreader.py index 8277b97..69712c1 100644 --- a/pywikibot/xmlreader.py +++ b/pywikibot/xmlreader.py @@ -1,11 +1,14 @@ -""" -XML reading module. +"""XML reading module.
Each XmlEntry object represents a page, as read from an XML source
The XmlDump class reads a pages_current XML dump (like the ones offered on https://dumps.wikimedia.org/backup-index.html) and offers a generator over XmlEntry objects which can be used by other bots. + +.. versionchanged:: 7.7 + *defusedxml* is used in favour of *xml.etree* if present to prevent + vulnerable XML attacks. *defusedxml* 0.7.1 or higher is recommended. """ # # (C) Pywikibot team, 2005-2022 @@ -14,7 +17,11 @@ # import re from typing import Optional -from xml.etree.ElementTree import iterparse, ParseError + +try: + from defusedxml.ElementTree import iterparse, ParseError +except ImportError: + from xml.etree.ElementTree import iterparse, ParseError
from pywikibot.backports import Callable, Type from pywikibot.tools import open_archive diff --git a/scripts/dataextend.py b/scripts/dataextend.py index c234485..ac6b861 100644 --- a/scripts/dataextend.py +++ b/scripts/dataextend.py @@ -67,13 +67,13 @@ from contextlib import suppress from html import unescape from textwrap import shorten -from typing import Tuple +from typing import Optional from urllib.error import HTTPError, URLError from urllib.parse import quote, unquote from urllib.request import urlopen
import pywikibot -from pywikibot.backports import List +from pywikibot.backports import List, Tuple from pywikibot.bot import input_yn, SingleSiteBot, suggest_help from pywikibot.data import sparql from pywikibot.exceptions import ( @@ -1242,7 +1242,7 @@ term = term.split('(')[0] if ',' in term: if term.split(',')[1].strip().lower() in ['jr', 'sr']: - term = term + '.' + term += '.' else: if term.strip()[-1] != term.strip()[-1].lower(): term = term.strip() + '.' @@ -1283,14 +1283,13 @@ answer = None return answer
- def findclaims(self): - if not self.id: - return + def findclaims(self) -> List[Tuple[str, str, Optional['Analyzer']]]: + if not self.id or not (self.url or self.sparqlquery): + return [] + self.html = '' - if not self.url and not self.sparqlquery: - return newclaims = [] - pywikibot.output() + pywikibot.info() pagerequest = None if not self.skipfirst: try: @@ -1316,7 +1315,6 @@ pywikibot.output('Unable to receive page {} - not unicode?' .format(self.url)) pagerequest = None - self.html = ''
if pagerequest: pagebytes = pagerequest.read() @@ -1330,6 +1328,8 @@ pywikibot.output('Getting {}'.format(extraurl)) if 'https' in self.url: context = ssl._create_unverified_context() + # validating the server certificate's hostname is recommended + context.check_hostname = True pagerequest = urlopen(extraurl, context=context) else: pagerequest = urlopen(extraurl) @@ -1338,14 +1338,15 @@ else: pagebytes = pagerequest.read() try: - self.html = self.html + '\n' + pagebytes.decode('utf-8') + self.html += '\n' + pagebytes.decode('utf-8') except UnicodeDecodeError: - self.html = self.html + '\n' + str(pagebytes) + self.html += '\n' + str(pagebytes)
if self.sparqlquery: self.html = str(sparql.SparqlQuery().select(self.sparqlquery)) + if not self.html: - return + return []
if self.escapeunicode: self.html = self.html.encode().decode('unicode-escape') @@ -5722,11 +5723,11 @@ section = self.findbyre( r'(?s)<div class="detail_label">Artistic Role(s):</div>\s*<div class="detail_text">(.*?)<', html) if section: - result = result + self.findallbyre(r'([^,]*)', section, 'occupation') + result += self.findallbyre(r'([^,]*)', section, 'occupation') section = self.findbyre( r'(?s)<div class="detail_label">Other Occupation(s):</div>\s*<div class="detail_text">(.*?)<', html) if section: - result = result + self.findallbyre(r'([^,]*)', section, 'occupation') + result += self.findallbyre(r'([^,]*)', section, 'occupation') return result
def findresidences(self, html: str): diff --git a/tox.ini b/tox.ini index 3b35408..654e9aa 100644 --- a/tox.ini +++ b/tox.ini @@ -219,7 +219,7 @@ # pep8-naming classmethod-decorators = classmethod,classproperty # required with pep8-naming < 0.13 -ignore-names = setUp,tearDown,setUpClass,tearDownClass,setUpModule,tearDownModule,maxDiff +ignore-names = setUp,tearDown,setUpClass,tearDownClass,setUpModule,tearDownModule,asyncSetUp,asyncTearDown,setUpTestData,failureException,longMessage,maxDiff
[isort] atomic = true
pywikibot-commits@lists.wikimedia.org