jenkins-bot has submitted this change and it was merged.
Change subject: proofreadpage.py: add ProofreadPage.index property and other methods
......................................................................
proofreadpage.py: add ProofreadPage.index property and other methods
Add:
- ProofreadPage.index property to get Index page containing the page
- IndexPage.page_gen() to load pages related to an Index page in
specified page range (filters are available for quality level and
page existence)
- IndexPage.get_number() to get page number of a page
- IndexPage.pages() to get the list of pages in Index
Rename:
- IndexPage.get_page_from_number() to get_page()
bs4 is now mandatory for ProofreadPage if ProofreadPage.index is
used.
Added and cleaned up docstrings.
Added related tests.
Change-Id: I9dab8c2e75dc27fe87500eac3202f14553525a82
---
M pywikibot/proofreadpage.py
M tests/proofreadpage_tests.py
2 files changed, 238 insertions(+), 22 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py
index c04951c..29458b3 100644
--- a/pywikibot/proofreadpage.py
+++ b/pywikibot/proofreadpage.py
@@ -7,6 +7,7 @@
This module includes objects:
* ProofreadPage(Page)
* FullHeader
+* IndexPage(Page)
"""
#
@@ -69,6 +70,12 @@
PROBLEMATIC = 2
PROOFREAD = 3
VALIDATED = 4
+ PROOFREAD_LEVELS = [WITHOUT_TEXT,
+ NOT_PROOFREAD,
+ PROBLEMATIC,
+ PROOFREAD,
+ VALIDATED,
+ ]
open_tag = '<noinclude>'
close_tag = '</noinclude>'
@@ -78,7 +85,7 @@
def __init__(self, source, title=''):
"""Instantiate a ProofreadPage object.
- Raises UnknownExtension if source Site has no ProofreadPage Extension.
+ @raise UnknownExtension: source Site has no ProofreadPage Extension.
"""
if not isinstance(source, pywikibot.site.BaseSite):
site = source.site
@@ -89,6 +96,67 @@
if self.namespace() != site.proofread_page_ns:
raise ValueError('Page %s must belong to %s namespace'
% (self.title(), ns))
+ # Ensure that constants are in line with Extension values.
+ if list(self.site.proofread_levels.keys()) != self.PROOFREAD_LEVELS:
+ raise ValueError('QLs do not match site values: %s != %s'
+ % (self.site.proofread_levels.keys(),
+ self.PROOFREAD_LEVELS))
+
+ @property
+ def index(self):
+ """Get the Index page which contains ProofreadPage.
+
+ To force reload, delete index and call it again.
+
+ Returns:
+ None: if ProofreadPage is linked to no or several Index pages
+ and no inerence can be done from titles.
+ IndexPage: if ProofreadPage is linked to one Index page.
+ """
+ if not hasattr(self, '_index'):
+ index_ns = self.site.proofread_index_ns
+ what_links_here = [IndexPage(page) for
+ page in self.getReferences(namespaces=index_ns)]
+
+ if not what_links_here:
+ self._index = (None, [])
+ elif len(what_links_here) == 1:
+ self._index = (what_links_here[0], [])
+ else:
+ self._index = (None, what_links_here)
+ # Try to infer names form page titles.
+ base, sep, num = self.title(withNamespace=False).rpartition('/')
+ if sep == '/':
+ for page in what_links_here:
+ if page.title(withNamespace=False) == base:
+ what_links_here.remove(page)
+ self._index = (page, what_links_here)
+ break
+
+ page, others = self._index
+ if others:
+ pywikibot.warning('Page %s is linked to several Index pages: %s.'
+ % (self, others))
+ if page:
+ pywikibot.warning(' %s selected as Index.' % page)
+ pywikibot.warning(' %s remaining.' % others)
+ elif not page:
+ pywikibot.warning('Page %s is not linked to any Index page.'
+ % self)
+
+ return page
+
+ @index.setter
+ def index(self, value):
+ if not isinstance(value, IndexPage):
+ raise ValueError('value %s must be a IndexPage object.'
+ % value)
+ self._index = (value, None)
+
+ @index.deleter
+ def index(self):
+ if hasattr(self, "_index"):
+ del self._index
def decompose(fn):
"""Decorator.
@@ -347,7 +415,15 @@
on de wikisource).
page label is the label associated with a page in the Index page.
- Raises UnknownExtension if source Site has no ProofreadPage Extension.
+ This class provides methods to get pages contained in Index page,
+ and relative page numbers and labels by means of several helper
+ functions.
+
+ It also providesa generator to pages contained in Index page, with
+ possibility to define range, filter by quality levels and page existance.
+
+ @raise UnknownExtension: source Site has no ProofreadPage Extension.
+ @raise ImportError: bs4 is not installed.
"""
# Check if BeautifulSoup is imported.
if isinstance(BeautifulSoup, ImportError):
@@ -415,6 +491,7 @@
title = a_tag.get('title')
page = ProofreadPage(self.site, title)
+ page.index = self # set index property for page
if page not in self._all_page_links:
raise pywikibot.Error('Page %s not recognised.' % page)
@@ -458,6 +535,55 @@
"""
return len(self._page_from_numbers)
+ def page_gen(self, start=1, end=None, filter_ql=None,
+ only_existing=False, content=True):
+ """Return a page generator which yields pages contained in Index page.
+
+ Range is [start ... end], extremes included.
+
+ @param start: first page, defaults to 1
+ @type start: int
+ @param end: num_pages if end is None
+ @type end: int
+ @param filter_ql: filters quality levels
+ if None: all but 'Without Text'.
+ @type filter_ql: list of ints (corresponding to ql constants
+ defined in ProofreadPage).
+ @param only_existing: yields only existing pages.
+ @type only_existing: bool
+ @param content: preload content.
+ @type content: bool
+ """
+ if end is None:
+ end = self.num_pages
+
+ if not ((1 <= start <= self.num_pages) and
+ (1 <= end <= self.num_pages) and
+ (start <= end)):
+ raise ValueError('start=%s, end=%s are not in valid range (%s, %s)'
+ % (start, end, 1, self.num_pages))
+
+ # All but 'Without Text'
+ if filter_ql is None:
+ filter_ql = list(self.site.proofread_levels.keys())
+ filter_ql.remove(ProofreadPage.WITHOUT_TEXT)
+
+ gen = (self.get_page(i) for i in range(start, end + 1))
+ if content:
+ gen = self.site.preloadpages(gen)
+ # Decorate and sort by page number because preloadpages does not
+ # guarantee order.
+ # TODO: remove if preloadpages will guarantee order.
+ gen = ((p, self.get_number(p)) for p in gen)
+ gen = (p[0] for p in sorted(gen, key=lambda x: x[1]))
+ # Filter by QL.
+ gen = (p for p in gen if p.ql in filter_ql)
+ # Yield only existing.
+ if only_existing:
+ gen = (p for p in gen if p.exists())
+
+ return gen
+
@check_if_cached
def get_label_from_page(self, page):
"""Return 'page label' for page.
@@ -486,7 +612,7 @@
try:
return self._labels_from_page_number[page_number]
except KeyError:
- raise KeyError('Page number ".../%s" not range.'
+ raise KeyError('Page number ".../%s" not in range.'
% page_number)
def _get_from_label(self, mapping_dict, label):
@@ -523,14 +649,26 @@
return self._get_from_label(self._pages_from_label, label)
@check_if_cached
- def get_page_from_number(self, page_number):
- """Return a page object from page number.
-
- @param page_number: int
- @return: page
- @rtype: page object
- """
+ def get_page(self, page_number):
+ """Return a page object from page number."""
try:
return self._page_from_numbers[page_number]
except KeyError:
raise KeyError('Invalid page number: %s.' % page_number)
+
+ @check_if_cached
+ def pages(self):
+ """Return the list of pages in Index, sorted by page number.
+
+ @return: list of pages
+ @rtype: list
+ """
+ return [self._page_from_numbers[i] for i in range(1, self.num_pages + 1)]
+
+ @check_if_cached
+ def get_number(self, page):
+ """Return a page number from page object."""
+ try:
+ return self._numbers_from_page[page]
+ except KeyError:
+ raise KeyError('Invalid page: %s.' % page)
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py
index 28dfb5f..e28d9a7 100644
--- a/tests/proofreadpage_tests.py
+++ b/tests/proofreadpage_tests.py
@@ -86,6 +86,7 @@
valid = {
'title': 'Page:Popular Science Monthly Volume 1.djvu/12',
+ 'index': 'Index:Popular Science Monthly Volume 1.djvu',
'ql': 4,
'user': 'T. Mazzei',
'header': u"{{rh|2|''THE POPULAR SCIENCE MONTHLY.''}}",
@@ -94,6 +95,10 @@
existing_invalid = {
'title': 'Main Page',
+ }
+
+ existing_unlinked = {
+ 'title': 'Page:Pywikibot unlinked test page',
}
not_existing_invalid = {
@@ -203,6 +208,61 @@
@require_modules('bs4')
+class TestProofreadPageIndexProperty(TestCase):
+
+ """Test ProofreadPage index property."""
+
+ family = 'wikisource'
+ code = 'en'
+
+ cached = True
+
+ valid = {
+ 'title': 'Page:Popular Science Monthly Volume 1.djvu/12',
+ 'index': 'Index:Popular Science Monthly Volume 1.djvu',
+ }
+
+ existing_multilinked = {
+ 'title': 'Page:Pywikibot test page 1/1',
+ 'index_1': 'Index:Pywikibot test page 1',
+ 'index_2': 'Index:Pywikibot test page 2',
+ }
+
+ existing_unlinked = {
+ 'title': 'Page:Pywikibot unlinked test page',
+ }
+
+ def test_index(self):
+ """Test index property."""
+ # Page with Index.
+ page = ProofreadPage(self.site, self.valid['title'])
+ index_page = IndexPage(self.site, self.valid['index'])
+
+ # Test propery.
+ self.assertEqual(page.index, index_page)
+
+ # Test deleter
+ del page.index
+ self.assertFalse(hasattr(page, '_index'))
+ # Test setter
+ page.index = index_page
+ self.assertEqual(page.index, index_page)
+
+ # Page without Index.
+ page = ProofreadPage(self.site, self.existing_multilinked['title'])
+ index_page_1 = IndexPage(self.site, self.existing_multilinked['index_1'])
+ index_page_2 = IndexPage(self.site, self.existing_multilinked['index_2'])
+ self.assertEqual(page.index, index_page_1)
+ self.assertNotEqual(page.index, index_page_2)
+ self.assertEqual(page._index, (index_page_1, [index_page_2]))
+
+ # Page without Index.
+ page = ProofreadPage(self.site, self.existing_unlinked['title'])
+ self.assertIs(page.index, None)
+ self.assertEqual(page._index, (None, []))
+
+
+@require_modules('bs4')
class IndexPageTestCase(TestCase):
"""Run tests related to IndexPage ProofreadPage extension."""
@@ -270,7 +330,8 @@
self.assertEqual(page.namespace(), source.namespace)
-class TestBasePageMethodsIndexPage(IndexPageTestCase, BasePageMethodsTestBase):
+@require_modules('bs4')
+class TestBasePageMethodsIndexPage(BasePageMethodsTestBase):
"""Test behavior of ProofreadPage methods inherited from BasePage."""
@@ -439,19 +500,36 @@
# Error if label does not exists.
self.assertRaises(KeyError, index_page.get_page_from_label, 'dummy label')
- # Test consistency of page <-> numbers mapping on last page_set and
- # num_set used.
- for p in page_set:
- n = index_page._numbers_from_page[p]
- self.assertEqual(index_page._page_from_numbers[n], p)
+ # Test get_page.
for n in num_set:
- p = index_page._page_from_numbers[n]
- self.assertEqual(index_page._numbers_from_page[p], n)
+ p = index_page.get_page(n)
+ self.assertEqual(index_page.get_number(p), n)
- # Test get_page_from_number.
- for n in num_set:
- p = index_page.get_page_from_number(n)
- self.assertEqual(index_page._numbers_from_page[p], n)
+ # Test get_number.
+ for p in page_set:
+ n = index_page.get_number(p)
+ self.assertEqual(index_page.get_page(n), p)
+
+ def test_page_gen(self, key):
+ """Test Index page generator."""
+ data = self.sites[key]
+ num, title_num, label = data['get_label']
+
+ index_page = IndexPage(self.site, self.sites[key]['index'])
+ page_title = self.sites[key]['page'].format(title_num)
+ proofread_page = ProofreadPage(self.site, page_title)
+
+ # Check start/end limits.
+ self.assertRaises(ValueError, index_page.page_gen, -1, 2)
+ self.assertRaises(ValueError, index_page.page_gen, 1, -1)
+ self.assertRaises(ValueError, index_page.page_gen, 2, 1)
+
+ # Check quality filters.
+ gen = index_page.page_gen(num, num, filter_ql=range(5))
+ self.assertEqual(list(gen), [proofread_page])
+
+ gen = index_page.page_gen(num, num, filter_ql=[0])
+ self.assertEqual(list(gen), [])
if __name__ == '__main__':
--
To view, visit https://gerrit.wikimedia.org/r/243489
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I9dab8c2e75dc27fe87500eac3202f14553525a82
Gerrit-PatchSet: 19
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: jenkins-bot <>
jenkins-bot has submitted this change and it was merged.
Change subject: [IMPROV] cosmetic_changes: Dynamic header fixer
......................................................................
[IMPROV] cosmetic_changes: Dynamic header fixer
Instead of iterating over the text for each of the allowed levels it could just
dynamically react on each header instance via a replacement method.
Also instead of using the %-notation which doesn't allow reusing indexed
parameters it can use `str.format` to generate the header wiki text.
Change-Id: I04a627e2d3f99ca093c1970a7ffa7d9b4f1630a1
---
M pywikibot/cosmetic_changes.py
1 file changed, 9 insertions(+), 8 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/cosmetic_changes.py b/pywikibot/cosmetic_changes.py
index 485769d..6b4af85 100755
--- a/pywikibot/cosmetic_changes.py
+++ b/pywikibot/cosmetic_changes.py
@@ -731,6 +731,10 @@
return text
def fixHtml(self, text):
+ def replace_header(match):
+ depth = int(match.group(1))
+ return r'{0} {1} {0}'.format('=' * depth, match.group(2))
+
# Everything case-insensitive (?i)
# Keep in mind that MediaWiki automatically converts <br> to <br />
exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
@@ -748,14 +752,11 @@
r'<hr \1 />',
exceptions)
# a header where only spaces are in the same line
- for level in range(1, 7):
- equals = '\\1%s \\2 %s\\3' % ("=" * level, "=" * level)
- text = textlib.replaceExcept(
- text,
- r'(?i)([\r\n]) *<h%d> *([^<]+?) *</h%d> *([\r\n])'
- % (level, level),
- r'%s' % equals,
- exceptions)
+ text = textlib.replaceExcept(
+ text,
+ r'(?i)(?<=[\r\n]) *<h([1-7])> *([^<]+?) *</h\1> *(?=[\r\n])',
+ replace_header,
+ exceptions)
# TODO: maybe we can make the bot replace <p> tags with \r\n's.
return text
--
To view, visit https://gerrit.wikimedia.org/r/244877
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I04a627e2d3f99ca093c1970a7ffa7d9b4f1630a1
Gerrit-PatchSet: 3
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: jenkins-bot <>
jenkins-bot has submitted this change and it was merged.
Change subject: pagegenerators.py: titleregex as filter of other generators
......................................................................
pagegenerators.py: titleregex as filter of other generators
Implement -titleregex as a filter, applying a regex to titles of
pages returned by the other page generators.
-start now defaults to ! if nothing is specified, instead of
asking for a page as it does today.
Functionality before this patch shall now be obtained using:
-start -titleregex:my_regex
Warning is emitted if grep/titleregex filters are specified but no
generators are requested.
Bug:T114015
Change-Id: I249c4ee61b89ea4042b08fff0a3dc4557170e6f4
---
M pywikibot/pagegenerators.py
M tests/pagegenerators_tests.py
2 files changed, 40 insertions(+), 30 deletions(-)
Approvals:
John Vandenberg: Looks good to me, approved
XZise: Looks good to me, but someone else must approve
jenkins-bot: Verified
diff --git a/pywikibot/pagegenerators.py b/pywikibot/pagegenerators.py
index eb6f583..ecffb9a 100644
--- a/pywikibot/pagegenerators.py
+++ b/pywikibot/pagegenerators.py
@@ -134,8 +134,9 @@
before -newpages.
If used with -recentchanges, efficiency is improved if
-namepace/ns is provided before -recentchanges.
- If used with -titleregex, -namepace/ns must be provided
- before -titleregex and shall contain only one value.
+
+ If used with -start, -namepace/ns shall contain only one
+ value.
-interwiki Work on the given page and all equivalent pages in other
languages. This can, for example, be used to fight
@@ -181,13 +182,21 @@
"-start:Template:!" will make the bot work on all pages
in the template namespace.
+ default value is start:!
+
-prefixindex Work on pages commencing with a common prefix.
-step:n When used with any other argument that specifies a set
of pages, only retrieve n pages at a time from the wiki
server.
--titleregex Work on titles that match the given regular expression.
+-titleregex A regular expression that needs to match the article title
+ otherwise the page won't be returned.
+ Multiple -titleregex:regexpr can be provided and the page will
+ be returned if title is matched by any of the regexpr
+ provided.
+ Case insensitive regular expressions will be used and
+ dot matches any character.
-transcludes Work on all pages that use a certain template.
Argument can also be given as "-transcludes:Title".
@@ -327,6 +336,7 @@
self.step = None
self.limit = None
self.articlefilter_list = []
+ self.titlefilter_list = []
self.claimfilter_list = []
self.intersect = False
self._site = site
@@ -396,6 +406,9 @@
if self.limit:
self.gens[i] = itertools.islice(self.gens[i], self.limit)
if len(self.gens) == 0:
+ if self.titlefilter_list or self.articlefilter_list:
+ pywikibot.warning(
+ 'grep/titleregex filters specified but no generators.')
return None
elif len(self.gens) == 1:
gensList = self.gens[0]
@@ -419,11 +432,15 @@
claim[0], claim[1],
claim[2], claim[3])
+ if self.titlefilter_list:
+ dupfiltergen = RegexFilterPageGenerator(
+ dupfiltergen, self.titlefilter_list)
+
if self.articlefilter_list:
- return RegexBodyFilterPageGenerator(
+ dupfiltergen = RegexBodyFilterPageGenerator(
PreloadingGenerator(dupfiltergen), self.articlefilter_list)
- else:
- return dupfiltergen
+
+ return dupfiltergen
def getCategoryGen(self, arg, recurse=False, content=False,
gen_func=None):
@@ -672,8 +689,7 @@
elif arg.startswith('-start'):
firstPageTitle = arg[7:]
if not firstPageTitle:
- firstPageTitle = pywikibot.input(
- u'At which page do you want to start?')
+ firstPageTitle = '!'
firstpagelink = pywikibot.Link(firstPageTitle,
self.site)
namespace = firstpagelink.namespace
@@ -739,18 +755,11 @@
gen = GoogleSearchPageGenerator(arg[8:])
elif arg.startswith('-titleregex'):
if len(arg) == 11:
- regex = pywikibot.input(u'What page names are you looking for?')
+ self.titlefilter_list.append(pywikibot.input(
+ 'What page names are you looking for?'))
else:
- regex = arg[12:]
- # partial workaround for bug T85389
- # to use -namespace/ns with -newpages, -ns must be given
- # before -titleregex, otherwise default namespace is 0.
- # allpages only accepts a single namespace, and will raise a
- # TypeError if self.namespaces contains more than one namespace.
- namespaces = self.namespaces or 0
- gen = RegexFilterPageGenerator(
- self.site.allpages(namespace=namespaces),
- regex)
+ self.titlefilter_list.append(arg[12:])
+ return True
elif arg.startswith('-grep'):
if len(arg) == 5:
self.articlefilter_list.append(pywikibot.input(
diff --git a/tests/pagegenerators_tests.py b/tests/pagegenerators_tests.py
index 1e8cd20..3f9cb4f 100755
--- a/tests/pagegenerators_tests.py
+++ b/tests/pagegenerators_tests.py
@@ -592,6 +592,7 @@
def test_regexfilter_default(self):
gf = pagegenerators.GeneratorFactory()
# Matches titles with the same two or more continous characters
+ self.assertTrue(gf.handleArg('-start'))
self.assertTrue(gf.handleArg('-titleregex:(.)\\1+'))
gf.handleArg('-limit:10')
gen = gf.getCombinedGenerator()
@@ -603,39 +604,39 @@
self.assertRegex(page.title().lower(), '(.)\\1+')
def test_regexfilter_ns_after(self):
- """Bug: T85389: -ns after -titleregex is ignored with a warning."""
gf = pagegenerators.GeneratorFactory()
+ self.assertTrue(gf.handleArg('-start'))
self.assertTrue(gf.handleArg('-titleregex:.*'))
gf.handleArg('-ns:1')
gf.handleArg('-limit:10')
gen = gf.getCombinedGenerator()
pages = list(gen)
- self.assertGreater(len(pages), 0)
self.assertLessEqual(len(pages), 10)
- self.assertPagesInNamespaces(pages, 0)
+ self.assertPagesInNamespaces(pages, 1)
- def test_regexfilter_ns_first(self):
+ def test_regexfilter_ns_before(self):
gf = pagegenerators.GeneratorFactory()
- # Workaround for Bug: T85389
- # Give -ns before -titleregex (as for -newpages)
+ self.assertTrue(gf.handleArg('-start'))
gf.handleArg('-ns:1')
self.assertTrue(gf.handleArg('-titleregex:.*'))
gf.handleArg('-limit:10')
gen = gf.getCombinedGenerator()
self.assertIsNotNone(gen)
pages = list(gen)
- self.assertGreater(len(pages), 0)
self.assertLessEqual(len(pages), 10)
self.assertPagesInNamespaces(pages, 1)
- def test_regexfilter_two_ns_first(self):
+ def test_allpages_with_two_ns(self):
+ """Test that allpages fails with two ns as parameter."""
gf = pagegenerators.GeneratorFactory()
+ self.assertTrue(gf.handleArg('-start'))
gf.handleArg('-ns:3,1')
- self.assertRaisesRegex(
+ # allpages only accepts a single namespace, and will raise a
+ # TypeError if self.namespaces contains more than one namespace.
+ self.assertRaises(
TypeError,
'allpages module does not support multiple namespaces',
- gf.handleArg,
- '-titleregex:.*')
+ gf.getCombinedGenerator)
def test_prefixing_default(self):
gf = pagegenerators.GeneratorFactory()
--
To view, visit https://gerrit.wikimedia.org/r/244817
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I249c4ee61b89ea4042b08fff0a3dc4557170e6f4
Gerrit-PatchSet: 4
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: jenkins-bot <>
jenkins-bot has submitted this change and it was merged.
Change subject: [IMPROV] cosmetic_changes: merge similar regexes
......................................................................
[IMPROV] cosmetic_changes: merge similar regexes
Instead of having two pretty similar regexes it's in most cases possible to
combine them easily. This is also fixing “dash” into “pipe” in the comment
explaining how the external link fixer works.
It also adds the site parameter to each changed call.
Change-Id: I8513e34f873730ee9c92abb75c499f67cb3d86fc
(cherry picked from commit 20b2b6bcd7be359fa7007a0a5c224a8692eb88e6)
---
M pywikibot/cosmetic_changes.py
1 file changed, 14 insertions(+), 26 deletions(-)
Approvals:
Mpaa: Looks good to me, approved
jenkins-bot: Verified
diff --git a/pywikibot/cosmetic_changes.py b/pywikibot/cosmetic_changes.py
index 57a2c78..31de09d 100755
--- a/pywikibot/cosmetic_changes.py
+++ b/pywikibot/cosmetic_changes.py
@@ -712,17 +712,13 @@
# r'\[https?://%s\.%s\.org/wiki/(?P<link>\S+)\s+(?P<title>.+?)\s?\]'
# % (self.site.code, self.site.family.name),
# r'[[\g<link>|\g<title>]]', exceptions)
- # external link in double brackets
+ # external link in/starting with double brackets
text = textlib.replaceExcept(
text,
- r'\[\[(?P<url>https?://[^\]]+?)\]\]',
- r'[\g<url>]', exceptions)
- # external link starting with double bracket
- text = textlib.replaceExcept(text,
- r'\[\[(?P<url>https?://.+?)\]',
- r'[\g<url>]', exceptions)
- # external link and description separated by a dash, with
- # whitespace in front of the dash, so that it is clear that
+ r'\[\[(?P<url>https?://[^\]]+?)\]\]?',
+ r'[\g<url>]', exceptions, site=self.site)
+ # external link and description separated by a pipe, with
+ # whitespace in front of the pipe, so that it is clear that
# the dash is not a legitimate part of the URL.
text = textlib.replaceExcept(
text,
@@ -742,14 +738,10 @@
# Keep in mind that MediaWiki automatically converts <br> to <br />
exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
'startspace']
- text = textlib.replaceExcept(text, r'(?i)<b>(.*?)</b>', r"'''\1'''",
- exceptions)
- text = textlib.replaceExcept(text, r'(?i)<strong>(.*?)</strong>',
- r"'''\1'''", exceptions)
- text = textlib.replaceExcept(text, r'(?i)<i>(.*?)</i>', r"''\1''",
- exceptions)
- text = textlib.replaceExcept(text, r'(?i)<em>(.*?)</em>', r"''\1''",
- exceptions)
+ text = textlib.replaceExcept(text, r'(?i)<(b|strong)>(.*?)</\1>',
+ r"'''\2'''", exceptions, site=self.site)
+ text = textlib.replaceExcept(text, r'(?i)<(i|em)>(.*?)</\1>',
+ r"''\2''", exceptions, site=self.site)
# horizontal line without attributes in a single line
text = textlib.replaceExcept(text, r'(?i)([\r\n])<hr[ /]*>([\r\n])',
r'\1----\2', exceptions)
@@ -800,19 +792,15 @@
exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
'startspace', 'gallery', 'hyperlink', 'interwiki', 'link']
# change <number> ccm -> <number> cm³
- text = textlib.replaceExcept(text, r'(\d)\s* ccm',
- r'\1 ' + u'cm³', exceptions)
- text = textlib.replaceExcept(text,
- r'(\d)\s*ccm', r'\1 ' + u'cm³',
- exceptions)
+ text = textlib.replaceExcept(text, r'(\d)\s*(?: )?ccm',
+ r'\1 cm³', exceptions,
+ site=self.site)
# Solve wrong Nº sign with °C or °F
# additional exception requested on fr-wiki for this stuff
pattern = re.compile(u'«.*?»', re.UNICODE)
exceptions.append(pattern)
- text = textlib.replaceExcept(text, r'(\d)\s* ' + u'[º°]([CF])',
- r'\1 ' + u'°' + r'\2', exceptions)
- text = textlib.replaceExcept(text, r'(\d)\s*' + u'[º°]([CF])',
- r'\1 ' + u'°' + r'\2', exceptions)
+ text = textlib.replaceExcept(text, r'(\d)\s*(?: )?[º°]([CF])',
+ r'\1 °\2', exceptions, site=self.site)
text = textlib.replaceExcept(text, u'º([CF])', u'°' + r'\1',
exceptions)
return text
--
To view, visit https://gerrit.wikimedia.org/r/244660
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I8513e34f873730ee9c92abb75c499f67cb3d86fc
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: 2.0
Gerrit-Owner: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: John Vandenberg <jayvdb(a)gmail.com>
Gerrit-Reviewer: Ladsgroup <ladsgroup(a)gmail.com>
Gerrit-Reviewer: Mpaa <mpaa.wiki(a)gmail.com>
Gerrit-Reviewer: XZise <CommodoreFabianus(a)gmx.de>
Gerrit-Reviewer: jenkins-bot <>