jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/669388 )
Change subject: [bugfix] data attribute is no longer supported with requests.Response
......................................................................
[bugfix] data attribute is no longer supported with requests.Response
Change-Id: I7cc3966335f98b760c7e8f148504c51a79e99ece
---
M scripts/reflinks.py
1 file changed, 15 insertions(+), 23 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index 7a4bb14..a27847b 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -462,16 +462,12 @@
pywikibot.stdout('HTTP error ({}) for {} on {}'
.format(err_num, link, pagetitleaslink))
- def getPDFTitle(self, ref, f):
- """Use pdfinfo to retrieve title from a PDF.
-
- FIXME: Unix-only, I'm afraid.
-
- """
+ def getPDFTitle(self, ref, response):
+ """Use pdfinfo to retrieve title from a PDF."""
pywikibot.output('PDF file.')
fd, infile = tempfile.mkstemp()
urlobj = os.fdopen(fd, 'w+')
- urlobj.write(f.text)
+ urlobj.write(response.text)
try:
pdfinfo_out = subprocess.Popen([r'pdfinfo', '/dev/stdin'],
@@ -535,16 +531,16 @@
ref = RefLink(link, match.group('name'), site=self.site)
try:
- f = comms.http.fetch(
+ r = comms.http.fetch(
ref.url, use_fake_user_agent=self._use_fake_user_agent)
# Try to get Content-Type from server
- content_type = f.headers.get('content-type')
+ content_type = r.headers.get('content-type')
if content_type and not self.MIME.search(content_type):
if ref.link.lower().endswith('.pdf') \
and not self.opt.ignorepdf:
# If file has a PDF suffix
- self.getPDFTitle(ref, f)
+ self.getPDFTitle(ref, r)
else:
pywikibot.output(color_format(
'{lightyellow}WARNING{default} : media : {} ',
@@ -566,7 +562,7 @@
continue
# Get the real url where we end (http redirects !)
- redir = f.url
+ redir = r.url
if redir != ref.link \
and domain.findall(redir) == domain.findall(link):
if soft404.search(redir) \
@@ -583,21 +579,21 @@
'Redirect to root : {0} ', ref.link))
continue
- if f.status_code != codes.ok:
+ if r.status_code != codes.ok:
pywikibot.stdout('HTTP error ({}) for {} on {}'
- .format(f.status_code, ref.url,
+ .format(r.status_code, ref.url,
page.title(as_link=True)))
# 410 Gone, indicates that the resource has been
# purposely removed
- if f.status_code == 410 \
- or (f.status_code == 404
+ if r.status_code == 410 \
+ or (r.status_code == 404
and '\t{}\t'.format(
ref.url) in self.dead_links):
repl = ref.refDead()
new_text = new_text.replace(match.group(), repl)
continue
- linkedpagetext = f.content
+ linkedpagetext = r.content
except UnicodeError:
# example:
@@ -636,15 +632,13 @@
# use charset from html
s = self.CHARSET.search(tag)
if s:
+ # Use encoding if found. Else use chardet apparent encoding
encoding = s.group('enc').strip('"\' ').lower()
naked = re.sub(r'[ _\-]', '', encoding)
# Convert to python correct encoding names
if naked == 'xeucjp':
encoding = 'euc_jp'
- f.data.encoding = encoding
- else:
- pywikibot.output('No charset found for ' + ref.link)
- f.data.encoding = None
+ r.encoding = encoding
if not content_type:
pywikibot.output('No content-type found for ' + ref.link)
@@ -658,10 +652,8 @@
new_text = new_text.replace(match.group(), repl)
continue
- u = f.text
-
# Retrieves the first non empty string inside <title> tags
- for m in self.TITLE.finditer(u):
+ for m in self.TITLE.finditer(r.text):
t = m.group()
if t:
ref.title = t
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/669388
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I7cc3966335f98b760c7e8f148504c51a79e99ece
Gerrit-Change-Number: 669388
Gerrit-PatchSet: 1
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki(a)aol.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged
jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/669360 )
Change subject: [IMPR] Simplify arg parsing and update documentation in reflinks.py
......................................................................
[IMPR] Simplify arg parsing and update documentation in reflinks.py
Change-Id: Ie5f58a5680491656919eb5c57ec01230fcad79d3
---
M scripts/reflinks.py
1 file changed, 26 insertions(+), 35 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index fe12b3e..15b9be5 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -2,19 +2,17 @@
"""
Fetch and add titles for bare links in references.
-This bot will search for references which are only made of a link without title
-(i.e. <ref>[https://www.google.fr/]</ref> or <ref>https://www.google.fr/</ref>)
-and will fetch the html title from the link to use it as the title of the wiki
-link in the reference, i.e.
+This bot will search for references which are only made of a link
+without title (i.e. <ref>[https://www.google.fr/]</ref> or
+<ref>https://www.google.fr/</ref>) and will fetch the html title from
+the link to use it as the title of the wiki link in the reference, i.e.
<ref>[https://www.google.fr/search?q=test test - Google Search]</ref>
-The bot checks every 20 edits a special stop page. If the page has been edited,
-it stops.
+The bot checks every 20 edits a special stop page. If the page has been
+edited, it stops.
-Warning: Running this script on German Wikipedia is not allowed anymore.
-
-As it uses it, you need to configure noreferences.py for your wiki, or it will
-not work.
+As it uses it, you need to configure noreferences.py for your wiki, or it
+will not work.
pdfinfo is needed for parsing pdf titles.
@@ -22,22 +20,22 @@
-limit:n Stops after n edits
--xml:dump.xml Should be used instead of a simple page fetching method from
- pagegenerators.py for performance and load issues
+-xml:dump.xml Should be used instead of a simple page fetching method
+ from pagegenerators.py for performance and load issues
-xmlstart Page to start with when using an XML dump
--ignorepdf Do not handle PDF files (handy if you use Windows and can't
- get pdfinfo)
+-ignorepdf Do not handle PDF files (handy if you use Windows and
+ can't get pdfinfo)
--summary Use a custom edit summary. Otherwise it uses the default
- one from i18n/reflinks.py
+-summary Use a custom edit summary. Otherwise it uses the
+ default one from translatewiki
The following generators and filters are supported:
¶ms;
"""
-# (C) Pywikibot team, 2008-2020
+# (C) Pywikibot team, 2008-2021
#
# Distributed under the terms of the MIT license.
#
@@ -739,24 +737,17 @@
gen_factory = pagegenerators.GeneratorFactory()
for arg in local_args:
- if arg.startswith('-summary:'):
- options['summary'] = arg[9:]
- elif arg in ('-always', '-ignorepdf'):
- options[arg[1:]] = True
- elif arg.startswith('-limit:'):
- options['limit'] = int(arg[7:])
- elif arg.startswith('-xmlstart'):
- if len(arg) == 9:
- xml_start = pywikibot.input(
- 'Please enter the dumped article to start with:')
- else:
- xml_start = arg[10:]
- elif arg.startswith('-xml'):
- if len(arg) == 4:
- xml_filename = pywikibot.input(
- "Please enter the XML dump's filename:")
- else:
- xml_filename = arg[5:]
+ opt, _, value = arg.partition(':')
+ if opt in ('-summary', '-limit'):
+ options[opt[1:]] = value
+ elif opt in ('-always', '-ignorepdf'):
+ options[opt[1:]] = True
+ elif opt == '-xmlstart':
+ xml_start = value or pywikibot.input(
+ 'Please enter the dumped article to start with:')
+ elif opt == '-xml':
+ xml_filename = value or pywikibot.input(
+ "Please enter the XML dump's filename:")
else:
gen_factory.handle_arg(arg)
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/669360
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: Ie5f58a5680491656919eb5c57ec01230fcad79d3
Gerrit-Change-Number: 669360
Gerrit-PatchSet: 1
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki(a)aol.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged
jenkins-bot has submitted this change. ( https://gerrit.wikimedia.org/r/c/pywikibot/core/+/669349 )
Change subject: [scripts] Restore reflinks.py
......................................................................
[scripts] Restore reflinks.py
Bug: T223826
Change-Id: I8e5d121de740930de6fbcd7f4fa3bd2b9173fa81
---
M docs/scripts/scripts.rst
M scripts/README.rst
R scripts/reflinks.py
M tox.ini
4 files changed, 12 insertions(+), 4 deletions(-)
Approvals:
Xqt: Looks good to me, approved
jenkins-bot: Verified
diff --git a/docs/scripts/scripts.rst b/docs/scripts/scripts.rst
index b72acdf..291246f 100644
--- a/docs/scripts/scripts.rst
+++ b/docs/scripts/scripts.rst
@@ -153,8 +153,14 @@
scripts.redirect script
-----------------------
+
.. automodule:: scripts.redirect
+scripts.reflinks script
+-----------------------
+
+.. automodule:: scripts.reflinks
+
scripts.replace script
----------------------
diff --git a/scripts/README.rst b/scripts/README.rst
index ca5e9a7..36f1217 100644
--- a/scripts/README.rst
+++ b/scripts/README.rst
@@ -91,6 +91,10 @@
| | solve_disambiguation also has functions which treat |
| | redirects. |
+------------------------+---------------------------------------------------------+
+ | reflinks.py | Search for references which are only made of a link |
+ | | without title and fetch the html title from the link to |
+ | | use it as the title of the wiki link in the reference. |
+ +------------------------+---------------------------------------------------------+
| replace.py | Search articles for a text and replace it by another |
| | text. Both text are set in two configurable |
| | text files. The bot can either work on a set of given |
@@ -133,6 +137,7 @@
+------------------------+---------------------------------------------------------+
| archive | Scripts no longer maintained. |
+ | | Please open a Phabricator task if you want to use one. |
+========================+=========================================================+
| blockpagechecker.py | Deletes any protection templates that are on pages |
| | which aren't actually protected. |
@@ -210,10 +215,6 @@
+------------------------+---------------------------------------------------------+
| protect.py | Protect and unprotect pages en masse. |
+------------------------+---------------------------------------------------------+
- | reflinks.py | Search for references which are only made of a link |
- | | without title and fetch the html title from the link to |
- | | use it as the title of the wiki link in the reference. |
- +------------------------+---------------------------------------------------------+
| selflink.py | This bot goes over multiple pages of the home wiki, |
| | searches for selflinks, and allows removing them. |
+------------------------+---------------------------------------------------------+
diff --git a/scripts/archive/reflinks.py b/scripts/reflinks.py
similarity index 100%
rename from scripts/archive/reflinks.py
rename to scripts/reflinks.py
diff --git a/tox.ini b/tox.ini
index 6a89ee0..1dd0d9d 100644
--- a/tox.ini
+++ b/tox.ini
@@ -161,6 +161,7 @@
scripts/movepages.py : N802, N803, N806
scripts/noreferences.py : N802, N803, N806, N816
scripts/redirect.py : N803, N806
+ scripts/reflinks.py : N802, N816
scripts/replace.py : N802, N803, N806, N816
scripts/solve_disambiguation.py : N802, N803, N806
scripts/templatecount.py: N802
--
To view, visit https://gerrit.wikimedia.org/r/c/pywikibot/core/+/669349
To unsubscribe, or for help writing mail filters, visit https://gerrit.wikimedia.org/r/settings
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Change-Id: I8e5d121de740930de6fbcd7f4fa3bd2b9173fa81
Gerrit-Change-Number: 669349
Gerrit-PatchSet: 2
Gerrit-Owner: Xqt <info(a)gno.de>
Gerrit-Reviewer: D3r1ck01 <xsavitar.wiki(a)aol.com>
Gerrit-Reviewer: Xqt <info(a)gno.de>
Gerrit-Reviewer: jenkins-bot
Gerrit-MessageType: merged