Commit 0fdd042d authored by Jerome Flesch's avatar Jerome Flesch

Merge branch: Complete rewrite of unit tests

Closes #104

See merge request !108
parents 222d7aee 666c9e5e
......@@ -7,3 +7,4 @@ venv*/
.tox/
.egg*/
_version.py
.coverage
......@@ -281,16 +281,10 @@ Beware this code hasn't been adapted to libtesseract 3 yet.
```sh
make check # requires pyflake8
make test # requires tox
make test # requires tox, pytest and python3
```
Tests are made to be run with the latest versions of Tesseract and Cuneiform.
The first tests verify that you're using the expected version.
To run the tesseract tests, you will need the following lang data files:
- English (tesseract-ocr-eng)
- French (tesseract-ocr-fra)
- Japanese (tesseract-ocr-jpn)
Tests are made to be run without external dependencies (no Tesseract or Cuneiform needed).
## OCR on natural scenes
......
......@@ -14,6 +14,8 @@ except ImportError:
import xml.dom.minidom
import logging
import six
from .util import to_unicode
logger = logging.getLogger(__name__)
......@@ -38,6 +40,7 @@ _XHTML_HEADER = to_unicode("""<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
""")
@six.python_2_unicode_compatible
class Box(object):
"""
Boxes are rectangles around each individual element recognized in the
......@@ -53,7 +56,6 @@ class Box(object):
tuple of tuple:
((box_pt_min_x, box_pt_min_y), (box_pt_max_x, box_pt_max_y))
"""
content = to_unicode(content)
self.content = content
self.position = position
self.confidence = confidence
......@@ -85,13 +87,13 @@ class Box(object):
return span_tag
def __str__(self):
return self.get_unicode_string().encode('utf-8')
return self.get_unicode_string()
def __box_cmp(self, other):
"""
Comparison function.
"""
if other is None:
if other is None or getattr(other, "position", None) is None:
return -1
for (x, y) in ((self.position[0][1], other.position[0][1]),
(self.position[1][1], other.position[1][1]),
......@@ -130,6 +132,7 @@ class Box(object):
return (position_hash ^ hash(self.content) ^ hash(self.content))
@six.python_2_unicode_compatible
class LineBox(object):
"""
Boxes are rectangles around each individual element recognized in the
......@@ -147,6 +150,14 @@ class LineBox(object):
self.word_boxes = word_boxes
self.position = position
@property
def content(self):
    """
    Text of the whole line.

    Returns:
        The content of every word box of the line, in order, separated by
        single spaces. Leading and trailing whitespace is stripped. An
        empty string is returned when the line has no word boxes.
    """
    # join() avoids the quadratic cost of repeated `+=` concatenation. The
    # final strip() keeps the result identical to the concatenating loop,
    # including when the first or last word box has empty content.
    return u" ".join(box.content for box in self.word_boxes).strip()
def get_unicode_string(self):
"""
Return the string corresponding to the box, in unicode (utf8).
......@@ -164,37 +175,29 @@ class LineBox(object):
self.position[1][1],
)
def __get_content(self):
    """
    Build the text of the line.

    Returns:
        The content of every word box, in order, separated by single
        spaces, with leading and trailing whitespace stripped.
    """
    # join() instead of repeated `+=`: same result, no quadratic copying.
    separator = to_unicode(" ")
    return separator.join(box.content for box in self.word_boxes).strip()

# Only a getter is registered, so `content` is a read-only attribute.
content = property(__get_content)
def get_xml_tag(self, parent_doc):
span_tag = parent_doc.createElement("span")
span_tag.setAttribute("class", "ocr_line")
span_tag.setAttribute("title", ("bbox %d %d %d %d" % (
(self.position[0][0], self.position[0][1],
self.position[1][0], self.position[1][1]))))
for box in self.word_boxes:
space = xml.dom.minidom.Text()
space.data = " "
span_tag.appendChild(space)
for box_idx, box in enumerate(self.word_boxes):
if box_idx:
space = xml.dom.minidom.Text()
space.data = " "
span_tag.appendChild(space)
box_xml = box.get_xml_tag(parent_doc)
span_tag.appendChild(box_xml)
return span_tag
def __str__(self):
return self.get_unicode_string().encode('utf-8')
return self.get_unicode_string()
def __box_cmp(self, other):
"""
Comparison function.
"""
if other is None:
if other is None or getattr(other, "position", None) is None:
return -1
for (x, y) in ((self.position[0][1], other.position[0][1]),
(self.position[1][1], other.position[1][1]),
......@@ -231,7 +234,7 @@ class LineBox(object):
position_hash += ((self.position[0][1] & 0xFF) << 8)
position_hash += ((self.position[1][0] & 0xFF) << 16)
position_hash += ((self.position[1][1] & 0xFF) << 24)
return (position_hash ^ hash(content) ^ hash(content))
return (position_hash ^ hash(content))
class BaseBuilder(object):
......@@ -253,39 +256,39 @@ class BaseBuilder(object):
self.cuneiform_args = cuneiform_args
# used with Tesseract and Cuneiform
def read_file(self, file_descriptor):
def read_file(self, file_descriptor): # pragma: no cover
"""
Read in the OCR results from `file_descriptor`
as an appropriate format.
"""
raise NotImplementedError("Implement in subclasses")
def write_file(self, file_descriptor, output):
def write_file(self, file_descriptor, output): # pragma: no cover
"""
Write the `output` to `file_descriptor`.
"""
raise NotImplementedError("Implement in subclasses")
# used with Libtesseract
def start_line(self, box):
def start_line(self, box): # pragma: no cover
"""
Start a new line of output.
"""
raise NotImplementedError("Implement in subclasses")
def add_word(self, word, box, confidence=0):
def add_word(self, word, box, confidence=0): # pragma: no cover
"""
Add a word to output.
"""
raise NotImplementedError("Implement in subclasses")
def end_line(self):
def end_line(self): # pragma: no cover
"""
End a line in output.
"""
raise NotImplementedError("Implement in subclasses")
def get_output(self):
def get_output(self): # pragma: no cover
"""
Return the output that has been built so far.
"""
......@@ -346,8 +349,7 @@ class TextBuilder(BaseBuilder):
def get_output(self):
return u"\n".join(self.built_text)
@staticmethod
def __str__():
def __str__(self):
return "Raw text"
......@@ -365,9 +367,8 @@ class DigitBuilder(TextBuilder):
The returned string is encoded in UTF-8.
"""
@staticmethod
def __str__():
return "Digits raw text."
def __str__(self):
return "Digits raw text"
def __init__(self, tesseract_layout=3):
super(DigitBuilder, self).__init__(tesseract_layout)
......@@ -472,8 +473,7 @@ class _WordHTMLParser(HTMLParser):
self.__current_line_content = []
return
@staticmethod
def __str__():
def __str__(self): # pragma: no cover
return "WordHTMLParser"
......@@ -514,7 +514,7 @@ class _LineHTMLParser(HTMLParser):
# strip x_bboxes
self.__char_positions = self.__char_positions[1:]
if self.__char_positions[-1] == "":
self.__char_positions[:-1]
self.__char_positions = self.__char_positions[:-1]
try:
while True:
self.__char_positions.remove("-1")
......@@ -550,8 +550,7 @@ class _LineHTMLParser(HTMLParser):
self.boxes.append(box)
self.__line_text = None
@staticmethod
def __str__():
def __str__(self): # pragma: no cover
return "LineHTMLParser"
......@@ -627,8 +626,7 @@ class WordBoxBuilder(BaseBuilder):
def get_output(self):
return self.word_boxes
@staticmethod
def __str__():
def __str__(self):
return "Word boxes"
......@@ -712,8 +710,7 @@ class LineBoxBuilder(BaseBuilder):
def get_output(self):
return self.lines
@staticmethod
def __str__():
def __str__(self):
return "Line boxes"
......@@ -726,8 +723,7 @@ class DigitLineBoxBuilder(LineBoxBuilder):
unable to process the input this way.
"""
@staticmethod
def __str__():
def __str__(self):
return "Digit line boxes"
def __init__(self, tesseract_layout=1):
......
......@@ -17,13 +17,12 @@ https://github.com/openpaperwork/pyocr#readme
import codecs
from io import BytesIO
import os
import re
import subprocess
import tempfile
from . import builders
from . import error
from .error import CuneiformError
from . import util
......@@ -37,7 +36,7 @@ CUNEIFORM_DATA_POSSIBLE_PATHS = [
LANGUAGES_LINE_PREFIX = "Supported languages: "
LANGUAGES_SPLIT_RE = re.compile("[^a-z]")
VERSION_LINE_RE = re.compile("Cuneiform for \w+ (\d+).(\d+).(\d+)")
VERSION_LINE_RE = re.compile(r"Cuneiform for \w+ (\d+).(\d+).(\d+)")
__all__ = [
'can_detect_orientation',
......@@ -63,30 +62,15 @@ def get_available_builders():
return [
builders.TextBuilder,
builders.WordBoxBuilder,
builders.LineBoxBuilder,
]
class CuneiformError(error.PyocrException):
    """
    Exception that records a `status` and a `message`.

    Both values are kept as attributes, and `args` is set to
    `(status, message)`.
    """

    def __init__(self, status, message):
        super(CuneiformError, self).__init__(message)
        # Overrides the args set by the base constructor so that the status
        # is part of the exception arguments too.
        self.args = (status, message)
        self.message = message
        self.status = status
def temp_file(suffix):
    """
    Create a named temporary file.

    The file name starts with 'cuneiform_' and ends with `suffix`.
    Returns the object produced by `tempfile.NamedTemporaryFile`.
    """
    options = {"prefix": 'cuneiform_', "suffix": suffix}
    return tempfile.NamedTemporaryFile(**options)
def cleanup(filename):
    """
    Best-effort removal of `filename`.

    Any OSError raised by the removal (for instance because the file does
    not exist) is silently discarded.
    """
    try:
        os.remove(filename)
    except OSError:
        # Nothing to clean up, or nothing we can do about it.
        return
def image_to_string(image, lang=None, builder=None):
if builder is None:
builder = builders.TextBuilder()
......@@ -152,8 +136,8 @@ def get_version():
proc.wait()
for line in output.split("\n"):
m = VERSION_LINE_RE.match(line)
g = m.groups()
if m is not None:
g = m.groups()
ver = (int(g[0]), int(g[1]), int(g[2]))
return ver
return None
......@@ -11,3 +11,11 @@ class TesseractError(PyocrException):
self.status = status
self.message = message
self.args = (status, message)
class CuneiformError(PyocrException):
    """
    Exception that records a `status` and a `message`.

    Both values are kept as attributes, and `args` is set to
    `(status, message)`.
    """

    def __init__(self, status, message):
        super(CuneiformError, self).__init__(message)
        # Overrides the args set by the base constructor so that the status
        # is part of the exception arguments too.
        self.args = (status, message)
        self.message = message
        self.status = status
......@@ -20,16 +20,10 @@ from . import tesseract_raw
from ..error import TesseractError
from ..util import digits_only
import locale
import logging
logger = logging.getLogger(__name__)
# Tesseract 4 workaround
if tesseract_raw.get_version() == "4.0.0":
locale.setlocale(locale.LC_ALL, "C")
__all__ = [
'can_detect_orientation',
'detect_orientation',
......@@ -85,6 +79,9 @@ def get_available_builders():
return [
builders.TextBuilder,
builders.WordBoxBuilder,
builders.DigitBuilder,
builders.LineBoxBuilder,
builders.DigitLineBoxBuilder,
]
......
import ctypes
import locale
import logging
import os
import sys
......@@ -10,13 +11,12 @@ logger = logging.getLogger(__name__)
TESSDATA_PREFIX = os.getenv('TESSDATA_PREFIX', None)
libnames = []
# 70 is the minimum credible dpi for tesseract and forces it to compute an
# estimate of the image dpi
DPI_DEFAULT = 70
if getattr(sys, 'frozen', False):
if getattr(sys, 'frozen', False): # pragma: no cover
# Pyinstaller integration
libnames += [os.path.join(sys._MEIPASS, "libtesseract-4.dll")]
libnames += [os.path.join(sys._MEIPASS, "libtesseract-3.dll")]
......@@ -31,7 +31,7 @@ if getattr(sys, 'frozen', False):
TESSDATA_PREFIX = tessdata
if sys.platform[:3] == "win":
if sys.platform[:3] == "win": # pragma: no cover
libnames += [
# Jflesch> Don't they have the equivalent of LD_LIBRARY_PATH on
# Windows ?
......@@ -56,12 +56,12 @@ else:
g_libtesseract = None
lib_load_errors = []
for libname in libnames:
for libname in libnames: # pragma: no branch
try:
g_libtesseract = ctypes.cdll.LoadLibrary(libname)
lib_load_errors = []
break
except OSError as ex:
except OSError as ex: # pragma: no cover
if hasattr(ex, 'message'):
# python 2
lib_load_errors.append((libname, ex.message))
......@@ -134,7 +134,7 @@ class OSResults(ctypes.Structure):
]
if g_libtesseract:
if g_libtesseract: # pragma: no cover
g_libtesseract.TessVersion.argtypes = []
g_libtesseract.TessVersion.restype = ctypes.c_char_p
......@@ -348,12 +348,17 @@ if g_libtesseract:
def init(lang=None):
assert(g_libtesseract)
# Tesseract 4 workaround
if get_version() == "4.0.0":
locale.setlocale(locale.LC_ALL, "C")
handle = g_libtesseract.TessBaseAPICreate()
try:
if lang:
lang = lang.encode("utf-8")
prefix = None
if TESSDATA_PREFIX:
if TESSDATA_PREFIX: # pragma: no cover
prefix = TESSDATA_PREFIX.encode("utf-8")
g_libtesseract.TessBaseAPIInit3(
ctypes.c_void_p(handle),
......
......@@ -17,6 +17,7 @@ https://github.com/openpaperwork/pyocr#readme
'''
import codecs
import errno
import logging
import os
import subprocess
......@@ -31,6 +32,12 @@ from .builders import DigitBuilder # backward compatibility
from .error import TesseractError # backward compatibility
from .util import digits_only
try:
FileNotFoundError
except NameError:
# python2 does not have FileNotFoundError
FileNotFoundError = IOError
# CHANGE THIS IF TESSERACT IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY
TESSERACT_CMD = 'tesseract.exe' if os.name == 'nt' else 'tesseract'
......@@ -105,8 +112,7 @@ class CharBoxBuilder(builders.BaseBuilder):
for box in boxes:
file_descriptor.write(box.get_unicode_string() + " 0\n")
@staticmethod
def __str__():
def __str__(self):
return "Character boxes"
......@@ -114,13 +120,13 @@ def _set_environment():
global g_subprocess_startup_info
global g_creation_flags
if os.name == "nt":
if os.name == "nt": # pragma: no cover
g_subprocess_startup_info = subprocess.STARTUPINFO()
g_subprocess_startup_info.wShowWindow = subprocess.SW_HIDE
g_subprocess_startup_info.dwFlags |= subprocess.STARTF_USESHOWWINDOW
g_creation_flags = 0x08000000 # CREATE_NO_WINDOW
if getattr(sys, 'frozen', False):
if getattr(sys, 'frozen', False): # pragma: no cover
# Pyinstaller support
path = os.environ["PATH"]
if sys._MEIPASS in path:
......@@ -241,6 +247,7 @@ def get_available_builders():
builders.WordBoxBuilder,
CharBoxBuilder,
builders.DigitBuilder,
builders.DigitLineBoxBuilder,
]
......@@ -298,11 +305,11 @@ def cleanup(filename):
''' Tries to remove the given filename. Ignores non-existent files '''
try:
os.remove(filename)
except OSError:
except OSError: # pragma: no cover
pass
class ReOpenableTempfile(object):
class ReOpenableTempfile(object): # pragma: no cover
"""
On Windows, `tempfile.NamedTemporaryFile` raises a Permission denied error
when the file is still open.
......@@ -372,23 +379,32 @@ def image_to_string(image, lang=None, builder=None):
if status:
raise TesseractError(status, errors)
tested_files = []
output_file_name = "ERROR"
for file_extension in builder.file_extensions:
output_file_name = ('%s.%s' % (os.path.join(tmpdir, "output"),
file_extension))
if not os.access(output_file_name, os.F_OK):
continue
tested_files.append(output_file_name)
try:
with codecs.open(output_file_name, 'r', encoding='utf-8',
errors='replace') as file_desc:
results = builder.read_file(file_desc)
return results
return builder.read_file(file_desc)
except FileNotFoundError as exc:
if sys.version_info < (3, 0):
# python2 has no specific FileNotFoundError exception
# so we rely on the errno of the IOError exception
if exc.errno == errno.ENOENT:
# file not found
continue
else:
raise exc
continue
finally:
cleanup(output_file_name)
break
raise TesseractError(-1, "Unable to find output file"
" last name tried: %s" % output_file_name)
raise TesseractError(
-1, "Unable to find output file (tested {})".format(tested_files)
)
def is_available():
......@@ -446,9 +462,6 @@ def get_version():
try:
ver_string = ver_string.split(" ")[1]
index = ver_string.find("dev")
if index:
ver_string = ver_string[:index]
els = ver_string.split(".")
els = [digits_only(x) for x in els]
......@@ -457,12 +470,13 @@ def get_version():
upd = 0
if len(els) >= 3:
upd = els[2]
return (major, minor, upd)
version = (major, minor, upd)
if version == (0, 0, 0):
raise TesseractError(
ret, ("Unable to parse Tesseract version (not a number): [%s]"
% (ver_string)))
return version
except IndexError:
raise TesseractError(
ret, ("Unable to parse Tesseract version (spliting failed): [%s]"
% (ver_string)))
except ValueError:
raise TesseractError(
ret, ("Unable to parse Tesseract version (not a number): [%s]"
% (ver_string)))
......@@ -17,7 +17,7 @@ def digits_only(string):
def to_unicode(string):
try:
return six.u(string)
except: # noqa: E722
except: # noqa: E722 # pragma: no cover
# probably already decoded
return string
......
T 105 705 130 734 0
h 136 704 155 734 0
e 161 703 178 725 0
( 205 696 216 732 0
q 222 694 241 725 0
u 247 702 266 723 0
i 273 703 281 733 0
c 288 701 305 724 0
k 310 702 329 732 0
) 335 695 347 730 0
[ 376 693 386 731 0
b 391 700 411 731 0
r 416 701 431 722 0
o 434 699 454 722 0
w 459 699 487 721 0
n 493 699 513 721 0
] 518 691 528 729 0
{ 559 691 572 728 0
f 580 698 598 729 0
o 597 697 617 720 0
x 622 698 642 718 0
} 650 690 663 727 0
j 687 688 700 727 0
u 706 695 727 717 0
m 732 696 764 718 0
p 768 687 789 717 0
s 794 694 809 717 0
! 817 694 823 726 0
O 104 654 133 685 0
v 137 654 157 675 0
e 162 653 179 675 0
T 105 705 130 734 0
h 136 704 155 734 0
e 161 703 178 725 0
( 205 696 216 732 0
q 222 694 241 725 0
u 247 702 266 723 0
i 273 703 281 733 0
c 288 701 305 724 0
k 310 702 329 732 0
) 335 695 347 730 0
[ 376 693 386 731 0
b 391 700 411 731 0
r 416 701 431 722 0
o 434 699 454 722 0
w 459 699 487 721 0
n 493 699 513 721 0
] 518 691 528 729 0
{ 559 691 572 728 0
f 580 698 598 729 0
o 597 697 617 720 0
x 622 698 642 718 0
} 650 690 663 727 0
j 687 688 700 727 0
u 706 695 727 717 0
m 732 696 764 718 0
p 768 687 789 717 0
s 794 694 809 717 0
! 817 694 823 726 0
O 104 654 133 685 0
v 137 654 157 675 0
e 162 653 179 675 0
# 1 2 3 4
......@@ -5,11 +5,18 @@
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name='ocr-system' content='tesseract 3.04.01' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
<meta name='ocr-system' content='tesseract 4.0.0' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "input/specific/test-digits.png"; bbox 0 0 275 36; ppageno 0'>
<div class='ocr_carea' id='block_1_1' title="bbox 4 0 275 32">
<p class='ocr_par' id='par_1_1' lang='eng' title="bbox 4 0 275 32">
<span class='ocr_line' id='line_1_1' title="bbox 4 0 275 32; baseline 0 0; x_size 43.333332; x_descenders 10.833333; x_ascenders 10.833333">
<span class='ocrx_word' id='word_1_1' title='bbox 4 0 275 32; x_wconf 68'>3355456544</span>
</span>
</p>
</div>
</div>
</body>
</html>
Phrae en français.
Phrase en français.
Avec des accents.
Ephémère
\ No newline at end of file
Éphémère
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
</head>
<body>
<span class="ocrx_word" title="bbox 105 66 178 97">The</span><br/>
<span class="ocrx_word" title="bbox 205 67 347 106">(quick)</span><br/>
<span class="ocrx_word" title="bbox 376 69 528 109">[brown]</span><br/>
<span class="ocrx_word" title="bbox 559 71 663 110">{fox}</span><br/>
<span class="ocrx_word" title="bbox 687 73 823 113">jumps!</span><br/>
<span class="ocrx_word" title="bbox 104 115 199 147">Over</span><br/>
<span class="ocrx_word" title="bbox 224 117 283 148">the</span><br/>
<span class="ocrx_word" title="bbox 310 117 533 155">$43,456.78</span><br/>
<span class="ocrx_word" title="bbox 561 121 696 162">&lt;lazy&gt;</span><br/>
<span class="ocrx_word" title="bbox 722 123 791 154">#90</span><br/>
<span class="ocrx_word" title="bbox 818 125 887 165">dog</span><br/>
<span class="ocrx_word" title="bbox 103 165 134 196">&amp;</span><br/>
<span class="ocrx_word" title="bbox 160 166 396 206">duck/goose,</span><br/>
<span class="ocrx_word" title="bbox 424 178 463 201">as</span><br/>