jenkins-bot merged this change.
proofreadpage_tests.py: fix failing OCR-tests
Text returned by OCR agorithm seems not determistic (e.g. one
possible reason might be that OCR tools used by such services
are updated and have better performances).
Test for text similarity instead of perfect text equality.
This is acceptabe as purpose of the test is to test the call to
OCR tools API, rather than the OCR process itself.
Bug: T225595
Change-Id: I500ddc6e76e0b5d4032b0f7a79e1632a9b5b3b24
---
M tests/proofreadpage_tests.py
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py
index efa1518..282b6a7 100644
--- a/tests/proofreadpage_tests.py
+++ b/tests/proofreadpage_tests.py
@@ -7,6 +7,7 @@
#
from __future__ import absolute_import, division, unicode_literals
+import difflib
import json
import pywikibot
@@ -335,10 +336,10 @@
'year 1872,\nBY D. APPLETON & CO.,\nIn the Office '
'of the Librarian of Congress, at '
'Washington.\n\n'),
- 'ocr': (False, 'lam-mam, according to Act of Congress, in the '
- 'year 157-2,\nBY D. APPLEION Av CO.,\nIn the '
- 'Of\ufb01ce or the Librarian of '
- 'Congress, at Washington.\n\n'),
+ 'ocr': (False, 'EsTEnen, according to Act of Congress, in the '
+ 'year 1872,\nBy D. APPLETON & CO.,\nIn the '
+ 'Office of the Librarian of Congress, at '
+ 'Washington.\n\u000c'),
'googleOCR': (False, 'ENTERED, according to Act of Congress, in '
'the year 1572,\nBY D. APPLETON & CO.\n'
'In the Office of the Librarian of '
@@ -363,7 +364,8 @@
self.skipTest(text)
ref_error, ref_text = self.data['hocr']
self.assertEqual(error, ref_error)
- self.assertEqual(text, ref_text)
+ s = difflib.SequenceMatcher(None, text, ref_text)
+ self.assertGreater(s.ratio(), 0.9)
def test_do_ocr_phetools(self):
"""Test page._do_ocr(ocr_tool='phetools')."""
@@ -372,7 +374,8 @@
if error:
self.skipTest(text)
self.assertEqual(error, ref_error)
- self.assertEqual(text, ref_text)
+ s = difflib.SequenceMatcher(None, text, ref_text)
+ self.assertGreater(s.ratio(), 0.9)
def test_do_ocr_googleocr(self):
"""Test page._do_ocr(ocr_tool='googleOCR')."""
@@ -381,7 +384,8 @@
self.skipTest(text)
ref_error, ref_text = self.data['googleOCR']
self.assertEqual(error, ref_error)
- self.assertEqual(text, ref_text)
+ s = difflib.SequenceMatcher(None, text, ref_text)
+ self.assertGreater(s.ratio(), 0.9)
def test_ocr_googleocr(self):
"""Test page.ocr(ocr_tool='googleOCR')."""
@@ -391,7 +395,8 @@
self.assertIsInstance(exc, ValueError)
else:
ref_error, ref_text = self.data['googleOCR']
- self.assertEqual(text, ref_text)
+ s = difflib.SequenceMatcher(None, text, ref_text)
+ self.assertGreater(s.ratio(), 0.9)
@require_modules('bs4')
To view, visit change 517756. To unsubscribe, or for help writing mail filters, visit settings.