jenkins-bot submitted this change.
[bugfix] Change regex to detect meta information
- match <meta> tags that declare a charset even when they have no content-type
- allow the charset value to be enclosed in matching single or double quotes
Bug: T298006
Change-Id: Ie1c56848d5485ee91a32c6c0bc75264c018ba05b
---
M scripts/reflinks.py
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/scripts/reflinks.py b/scripts/reflinks.py
index c37296e..b3e425e 100755
--- a/scripts/reflinks.py
+++ b/scripts/reflinks.py
@@ -481,9 +481,11 @@
.format(self.stop_page.title(as_link=True)))
# Regex to grasp content-type meta HTML tag in HTML source
- self.META_CONTENT = re.compile(br'(?i)<meta[^>]*content\-type[^>]*>')
+ self.META_CONTENT = re.compile(
+ br'(?i)<meta[^>]*(?:content\-type|charset)[^>]*>')
# Extract the encoding from a charset property (from content-type !)
- self.CHARSET = re.compile(r'(?i)charset\s*=\s*(?P<enc>[^\'",;>/]*)')
+ self.CHARSET = re.compile(
+ r'(?i)charset\s*=\s*(?P<enc>(?P<q>[\'"]?)[^\'",;>/]*(?P=q))')
# Extract html title from page
self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)')
# Matches content inside <script>/<style>/HTML comments
To view, visit change 748371. To unsubscribe, or for help writing mail filters, visit settings.