Commit f36f2492 authored by Gian Luca Dalla Torre's avatar Gian Luca Dalla Torre

Add digits support for pyocr (Tesseract only).

parent db6ec7af
......@@ -44,11 +44,16 @@ bmp, tiff, and others. It also support bounding box data.
line_and_word_boxes = tool.image_to_string(
Image.open('test.png'), lang=lang,
builder=pyocr.builders.LineBoxBuilder())
# Digits - Only Tesseract
digits = tool.image_to_string(Image.open('test-digits.png'),
lang=lang,
builder=pyocr.tesseract.DigitBuilder())
## Dependencies
* Pyocr requires python 2.7 or later.
* Pyocr requires python 2.6 or later.
* You will need [Pillow](https://github.com/python-imaging/Pillow)
or Python Imaging Library (PIL). Under Debian/Ubuntu, PIL is in
the package "python-imaging".
......
......@@ -44,6 +44,7 @@ TESSDATA_EXTENSION = ".traineddata"
__all__ = [
'CharBoxBuilder',
'DigitBuilder',
'get_available_builders',
'get_available_languages',
'get_name',
......@@ -105,6 +106,21 @@ class CharBoxBuilder(object):
return "Character boxes"
class DigitBuilder(builders.TextBuilder):
    """
    Text builder restricted to digits (Tesseract only).

    When an instance is passed to image_to_string(), character recognition
    treats the text as if it were composed only of digits, so the returned
    string contains only digits.
    """

    def __init__(self, tesseract_layout=3):
        """
        Arguments:
            tesseract_layout --- forwarded unchanged to
                builders.TextBuilder.__init__()
        """
        super(DigitBuilder, self).__init__(tesseract_layout)
        # Add the "digits" entry to the configs inherited from TextBuilder.
        self.tesseract_configs.extend(["digits"])

    @staticmethod
    def __str__():
        # Static and argument-free on purpose: callable on the class itself
        # as well as on an instance.
        return "Digits only"
def get_name():
    """Return the name of this module's tool: the string "Tesseract"."""
    return "Tesseract"
......
3355456544
\ No newline at end of file
......@@ -155,6 +155,32 @@ class TestCharBox(unittest.TestCase):
pass
class TestDigits(unittest.TestCase):
    """
    Checks that digits-only recognition with Tesseract produces the
    expected text.
    """

    def setUp(self):
        # Every test in this class runs OCR through the digit builder.
        self.builder = tesseract.DigitBuilder()

    def __test_text(self, image_file, expected_output_file, lang='eng'):
        """
        Run OCR on an image and compare the result with a reference file.

        Arguments:
            image_file --- image file name, relative to "tests/data/"
            expected_output_file --- reference text file name, relative to
                "tests/tesseract/"; read as UTF-8 and stripped of leading
                and trailing whitespace before the comparison
            lang --- language passed to tesseract.image_to_string()
        """
        image_path = "tests/data/" + image_file
        reference_path = "tests/tesseract/" + expected_output_file

        with codecs.open(reference_path, 'r', encoding='utf-8') as reference:
            expected = reference.read().strip()

        actual = tesseract.image_to_string(
            Image.open(image_path),
            lang=lang,
            builder=self.builder,
        )
        self.assertEqual(actual, expected)

    def test_digits(self):
        self.__test_text('test-digits.png', 'test-digits.txt')
class TestWordBox(unittest.TestCase):
"""
These tests make sure that Tesseract box handling works fine.
......@@ -296,6 +322,7 @@ class TestLineBox(unittest.TestCase):
def tearDown(self):
pass
def get_all_tests():
all_tests = unittest.TestSuite()
......@@ -329,4 +356,10 @@ def get_all_tests():
tests = unittest.TestSuite(map(TestLineBox, test_names))
all_tests.addTest(tests)
test_names = [
'test_digits'
]
tests = unittest.TestSuite(map(TestDigits, test_names))
all_tests.addTest(tests)
return all_tests
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment