jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/614565 )
Change subject: [bugfix] Add title_delimiter_and_alias into family files ......................................................................
[bugfix] Add title_delimiter_and_alias into family files
Titles usually are delimited by a space and the alias is replaced by this delimiter; e.g. "Main page" is the title with spaces as delimiters but "Main_page" also works. Other families like wikihow have a different setting.
- add title_delimiter_and_alias to family.py as default - add a different title_delimiter_and_alias to wikihow_family.py - use these settings for Page.Link and when comparing titles - tests updated
Bug: T294761 Change-Id: Ib7858b88324376b6bbbf788893308fcf66c4d154 --- M pywikibot/families/wikihow_family.py M pywikibot/family.py M pywikibot/page/__init__.py M pywikibot/site/_basesite.py M tests/link_tests.py 5 files changed, 61 insertions(+), 12 deletions(-)
Approvals: Xqt: Looks good to me, approved jenkins-bot: Verified
diff --git a/pywikibot/families/wikihow_family.py b/pywikibot/families/wikihow_family.py index 4aa93f7..38d3ab4 100644 --- a/pywikibot/families/wikihow_family.py +++ b/pywikibot/families/wikihow_family.py @@ -15,7 +15,7 @@
"""Family class for Wikihow Wiki.
- .. versionaddded: 3.0 + .. versionadded:: 3.0 """
name = 'wikihow' @@ -25,8 +25,12 @@ 'ar', 'cs', 'de', 'en', 'es', 'fr', 'hi', 'id', 'it', 'ja', 'ko', 'nl', 'pt', 'ru', 'th', 'tr', 'vi', 'zh', ) + removed_wikis = ['ca', 'cy', 'fa', 'he', 'pl', 'ur']
+ title_delimiter_and_aliases = '- ' + """.. versionadded:: 7.0""" + @classproperty def domains(cls): """List of domains used by family wikihow.""" diff --git a/pywikibot/family.py b/pywikibot/family.py index 006c548..d5240a9 100644 --- a/pywikibot/family.py +++ b/pywikibot/family.py @@ -542,6 +542,21 @@ # site. This value can specify this last one with (lang, family) tuple. shared_urlshortner_wiki = None # type: Optional[Tuple[str, str]]
+ title_delimiter_and_aliases = ' _' + """Titles usually are delimited by a space and the alias is replaced + to this delimiter; e.g. "Main page" is the title with spaces as + delimiters but "Main_page" also works. Other families may have + different settings. + + .. note:: The first character is used as delimiter, the others are + aliases. + + .. warning:: This attribute is used within ``re.sub()`` method. Use + escape sequence if necessary + + .. versionadded:: 7.0 + """ + _families = {}
@staticmethod diff --git a/pywikibot/page/__init__.py b/pywikibot/page/__init__.py index 59d5781..56841d6 100644 --- a/pywikibot/page/__init__.py +++ b/pywikibot/page/__init__.py @@ -5249,6 +5249,8 @@ else: self._anchor = None
+ self._text = self._text.strip() + # Convert URL-encoded characters to unicode self._text = pywikibot.tools.chars.url2string( self._text, encodings=self._source.encodings()) @@ -5267,9 +5269,11 @@ '{!r} contains illegal char {!r}'.format(t, '\ufffd'))
# Cleanup whitespace + sep = self._source.family.title_delimiter_and_aliases[0] t = re.sub( - '[_ \xa0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]+', - ' ', t) + '[{}\xa0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]+' + .format(self._source.family.title_delimiter_and_aliases), + sep, t) # Strip spaces at both ends t = t.strip() # Remove left-to-right and right-to-left markers. diff --git a/pywikibot/site/_basesite.py b/pywikibot/site/_basesite.py index 3a3a605..f92ae18 100644 --- a/pywikibot/site/_basesite.py +++ b/pywikibot/site/_basesite.py @@ -378,10 +378,14 @@ return default_ns, title return ns, name
- # Replace underscores with spaces and multiple combinations of them - # with only one space - title1 = re.sub(r'[_ ]+', ' ', title1) - title2 = re.sub(r'[_ ]+', ' ', title2) + # Replace alias characters like underscores with title + # delimiters like spaces and multiple combinations of them with + # only one delimiter + sep = self.family.title_delimiter_and_aliases[0] + pattern = re.compile('[{}]+' + .format(self.family.title_delimiter_and_aliases)) + title1 = pattern.sub(sep, title1) + title2 = pattern.sub(sep, title2) if title1 == title2: return True
diff --git a/tests/link_tests.py b/tests/link_tests.py index 220bebd..f8e9dbb 100644 --- a/tests/link_tests.py +++ b/tests/link_tests.py @@ -64,6 +64,19 @@ default site is using completely different namespaces. """
+ def replaced(self, iterable): + """Replace family specific title delimiter.""" + for items in iterable: + if isinstance(items, str): + items = [items] + items = [re.sub(' ', + self.site.family.title_delimiter_and_aliases[0], + item) + for item in items] + if len(items) == 1: + items = items[0] + yield items + def test_valid(self): """Test that valid titles are correctly normalized.""" title_tests = ['Sandbox', 'A "B"', "A 'B'", '.com', '~', '"', "'", @@ -87,11 +100,11 @@
site = self.get_site()
- for title in title_tests: + for title in self.replaced(title_tests): with self.subTest(title=title): self.assertEqual(Link(title, site).title, title)
- for link, title in extended_title_tests: + for link, title in self.replaced(extended_title_tests): with self.subTest(link=link, title=title): self.assertEqual(Link(link, site).title, title)
@@ -138,7 +151,7 @@
title_tests = [ # Empty title - (['', ':', '__ __', ' __ '], + (['', ':'], r'^The link [[.*]] does not contain a page title$'),
(['A [ B', 'A ] B', 'A { B', 'A } B', 'A < B', 'A > B'], @@ -165,12 +178,21 @@ ([('x' * 256), ('Invalid:' + 'X' * 248)], generate_overlength_exc_regex),
- (['Talk:', 'Category: ', 'Category: #bar'], + (['Talk:'], generate_has_no_title_exc_regex), ]
+ # Known issues with wikihow. + if self.site.family.name != 'wikihow': + title_tests.extend([ + (['Category: ', 'Category: #bar'], + generate_has_no_title_exc_regex), + (['__ __', ' __ '], + r'^The link [[]] does not contain a page title$'), + ]) + for texts_to_test, exception_regex in title_tests: - for text in texts_to_test: + for text in self.replaced(texts_to_test): with self.subTest(title=text): if callable(exception_regex): regex = exception_regex(text)
pywikibot-commits@lists.wikimedia.org