Commit c498d55d authored by Jerome Flesch

Make room for support of other OCR tools

Signed-off-by: Jerome Flesch <jflesch@gmail.com>
parent d80ad629
#!/usr/bin/env python
# Test launcher: prints the OCR tools that pyocr reports as available, then
# runs the unit test suites with unittest's text runner.
# NOTE(review): indentation was lost in this listing, and it appears to mix
# removed and added diff lines (both 'tests' and 'tests_tesseract' are
# imported and run) -- confirm against the real script.
import sys
# Put the local "src" directory at the front of the module search path.
sys.path = [ "src" ] + sys.path
import unittest
from tests import tests
import pyocr
from tests import tests_tesseract
if __name__ == '__main__':
unittest.TextTestRunner().run(tests.get_all_tests())
# List every tool returned by pyocr.get_available_tools() by name.
print "OCR tool found:"
for tool in pyocr.get_available_tools():
print "- %s" % tool.get_name()
print "---"
print "Tesseract:"
unittest.TextTestRunner().run(tests_tesseract.get_all_tests())
import xml
__all__ = [
'Box',
'TextBuilder',
'WordBoxBuilder',
]
class Box(object):
"""
Boxes are rectangles around each individual element recognized in the image.
Elements are either char or word depending of the builder that was used.
"""
def __init__(self, content, position):
"""
Arguments:
content --- a single string
position --- the position of the box on the image. Given as a
tuple of tuple:
((width_pt_x, height_pt_x), (width_pt_y, height_pt_y))
"""
self.content = content
self.position = position
def get_unicode_string(self):
"""
Return the string corresponding to the box, in unicode (utf8).
This string can be stored in a file as-is (see write_box_file())
and reread using read_box_file().
"""
return "%s %d %d %d %d" % (
self.content,
self.position[0][0],
self.position[0][1],
self.position[1][0],
self.position[1][1],
)
def get_xml_tag(self):
span_tag = xml.dom.minidom.Element("span")
span_tag.setAttribute("class", "ocr_word")
span_tag.setAttribute("title", ("bbox %d %d %d %d" % (
(self.position[0][0], self.position[0][1],
self.position[1][0], self.position[1][1]))))
txt = xml.dom.minidom.Text()
txt.data = self.content
span_tag.appendChild(txt)
return span_tag
def __str__(self):
return self.get_unicode_string().encode('ascii', 'replace')
def __box_cmp(self, other):
"""
Comparison function.
"""
if other == None:
return -1
for cmp_result in (cmp(self.position[0][1], other.position[0][1]),
cmp(self.position[1][1], other.position[1][1]),
cmp(self.position[0][0], other.position[0][0]),
cmp(self.position[1][0], other.position[1][0])):
if cmp_result != 0:
return cmp_result
return 0
def __lt__(self, other):
return self.__box_cmp(other) < 0
def __gt__(self, other):
return self.__box_cmp(other) > 0
def __eq__(self, other):
return self.__box_cmp(other) == 0
def __le__(self, other):
return self.__box_cmp(other) <= 0
def __ge__(self, other):
return self.__box_cmp(other) >= 0
def __ne__(self, other):
return self.__box_cmp(other) != 0
def __hash__(self):
position_hash = 0
position_hash += ((self.position[0][0] & 0xFF) << 0)
position_hash += ((self.position[0][1] & 0xFF) << 8)
position_hash += ((self.position[1][0] & 0xFF) << 16)
position_hash += ((self.position[1][1] & 0xFF) << 24)
return (position_hash ^ hash(self.content) ^ hash(self.content))
class TextBuilder(object):
    """
    Builder producing plain text.

    When handed to image_to_string(), the call returns one simple string:
    the output of the OCR tool as-is, i.e. the raw text it produced.

    Warning:
        The returned string is encoded in UTF-8
    """

    file_extension = "txt"
    tesseract_configs = []

    def __init__(self):
        pass

    @staticmethod
    def read_file(file_descriptor):
        """
        Return the whole content of 'file_descriptor' as a string, with
        leading and trailing whitespace stripped.
        """
        content = file_descriptor.read()
        return content.strip()

    @staticmethod
    def write_file(file_descriptor, text):
        """
        Write the string 'text' to 'file_descriptor' unchanged.
        """
        file_descriptor.write(text)

    # Static and argument-less, so it can be called on the class itself.
    @staticmethod
    def __str__():
        return "Raw text"
class WordBoxBuilder(object):
    """
    If passed to image_to_string(), image_to_string() will return an array of
    Box. Each box contains a word recognized in the image.
    """

    file_extension = "html"
    # NOTE(review): presumably the config name(s) handed to Tesseract to get
    # hOCR output -- confirm against the caller.
    tesseract_configs = ['hocr']

    def __init__(self):
        pass

    @staticmethod
    def __parse_position(xml_tag):
        """
        Extract the bounding box from the 'title' attribute of 'xml_tag'.

        The title is a ';'-separated list of properties. The one starting
        with "bbox" is used wherever it appears; if there is none, the last
        property is parsed instead (historical behavior).

        Return:
            ((x0, y0), (x1, y1)) as integers.
        """
        properties = xml_tag.getAttribute("title").split(";")
        fields = None
        for prop in properties:
            tokens = prop.split()
            if tokens and tokens[0] == "bbox":
                fields = tokens
                break
        if fields is None:
            fields = properties[-1].split()
        return ((int(fields[1]), int(fields[2])),
                (int(fields[3]), int(fields[4])))

    @staticmethod
    def __extract_txt(xml_tag):
        """
        Return the text found under 'xml_tag', walking nested tags
        recursively and concatenating the pieces in document order.
        """
        parts = []
        for tag in xml_tag.childNodes:
            if tag.nodeType == tag.TEXT_NODE:
                parts.append(tag.wholeText)
            else:
                parts.append(WordBoxBuilder.__extract_txt(tag))
        return u"".join(parts)

    @staticmethod
    def read_file(file_descriptor):
        """
        Extract a set of Box from the hOCR content of 'file_descriptor'.
        Only <span> tags whose class is "ocr_word" are kept.

        Return:
            An array of Box.
        """
        # A bare "import xml" does not load the xml.dom.minidom submodule,
        # so import it explicitly where it is used.
        import xml.dom.minidom

        xml_string = file_descriptor.read().encode("utf-8")
        xml_doc = xml.dom.minidom.parseString(xml_string)
        boxes = []
        for tag in xml_doc.getElementsByTagName("span"):
            if tag.getAttribute("class") != "ocr_word":
                continue
            txt = WordBoxBuilder.__extract_txt(tag)
            position = WordBoxBuilder.__parse_position(tag)
            boxes.append(Box(txt, position))
        return boxes

    @staticmethod
    def write_file(file_descriptor, boxes):
        """
        Write boxes in a box file. Output is a *very* *simplified* version
        of hOCR: one line per box, wrapped in a <body> element.

        Warning:
            The file_descriptor must support UTF-8 ! (see module 'codecs')
        """
        file_descriptor.write(u"<body>\n")
        for box in boxes:
            file_descriptor.write(box.get_xml_tag().toxml() + u"\n")
        file_descriptor.write(u"</body>\n")

    # Static and argument-less, so it can be called on the class itself.
    @staticmethod
    def __str__():
        return "Word boxes"
#!/usr/bin/env python
"""
Wrapper for various OCR tools.
USAGE:
import Image
import sys
from pyocr import pyocr
tools = pyocr.get_available_tools()[:]
if len(tools) == 0:
print "No OCR tool found"
sys.exit(1)
print "Using '%s'" % (tools[0].get_name())
tools[0].image_to_string(Image.open('test.png'), lang='fra',
builder=TextBuilder())
DETAILS:
Each module wrapping an OCR tool provides the following functions:
- get_name(): Return the name of the tool
- is_available(): Returns True if the tool is installed, False otherwise.
- get_version(): Return a tuple containing the version of the tool (if
installed)
- get_available_builders(): Returns a list of builders that can be used with
this tool (see image_to_string())
- get_available_languages(): Returns a list of languages supported by this
tool. Languages are usually written as three-letter ISO language codes (e.g. 'fra')
- image_to_string():
Takes 3 arguments:
- an image (see python Imaging "Image" module) (mandatory)
- lang=<language> (see get_available_languages()) (optional)
- builder=<builder> (see get_available_builders() or the classes in the
module 'pyocr.builders') (optional: default is
pyocr.builders.TextBuilder)
The returned value depends on the specified builder.
COPYRIGHT:
Pyocr is released under the GPL v3.
Copyright (c) Jerome Flesch, 2011
Tesseract module: Copyright (c) Samuel Hoffstaetter, 2009
WEBSITE:
https://github.com/jflesch/python-tesseract#readme
"""
import tesseract
__all__ = [
'get_available_tools',
'TOOLS',
]
TOOLS = [ # in preference order
tesseract
]
def get_available_tools():
    """
    List the OCR tools from TOOLS whose is_available() returns a true value,
    keeping the order of TOOLS.
    """
    return [tool for tool in TOOLS if tool.is_available()]
......@@ -23,6 +23,8 @@ import subprocess
import sys
import xml.dom.minidom
import builders
# CHANGE THIS IF TESSERACT IS NOT IN YOUR PATH, OR IS NAMED DIFFERENTLY
TESSERACT_CMD = 'tesseract'
......@@ -36,139 +38,17 @@ TESSDATA_EXTENSION = ".traineddata"
# Names exported by "from <this module> import *".
__all__ = [
    'Box',
    'CharBoxBuilder',
    'get_available_builders',
    'get_available_languages',
    # The comma below matters: without it Python concatenates the two
    # adjacent string literals into 'get_nameget_version'.
    'get_name',
    'get_version',
    'image_to_string',
    'is_available',
    'read_box_file',
    'TesseractError',
    'TextBuilder',
    'WordBoxBuilder',
    'write_box_file',
]
class TextBuilder(object):
    """
    Builder producing plain text.

    When handed to image_to_string(), the call returns one simple string:
    the output of Tesseract, as-is.

    Warning:
        The returned string is encoded in UTF-8
    """

    file_extension = "txt"
    tesseract_configs = []

    def __init__(self):
        pass

    @staticmethod
    def read_file(file_descriptor):
        """
        Return the whole content of 'file_descriptor' as a string, with
        leading and trailing whitespace stripped.
        """
        content = file_descriptor.read()
        return content.strip()

    @staticmethod
    def write_file(file_descriptor, text):
        """
        Write the string 'text' to 'file_descriptor' unchanged.
        """
        file_descriptor.write(text)
class Box(object):
"""
Tesseract Box: Tesseract boxes are rectangles around each individual
element recognized in the image. Elements are char or word depending
of the builder.
"""
def __init__(self, content, position):
"""
Instantiate a Tesseract box
Arguments:
content --- a single string
position --- the position of the box on the image. Given as a
tuple of tuple:
((width_pt_x, height_pt_x), (width_pt_y, height_pt_y))
"""
self.content = content
self.position = position
def get_unicode_string(self):
"""
Return the string corresponding to the box, in unicode (utf8).
This string can be stored in a file as-is (see write_box_file())
and reread using read_box_file().
"""
return "%s %d %d %d %d" % (
self.content,
self.position[0][0],
self.position[0][1],
self.position[1][0],
self.position[1][1],
)
def get_xml_tag(self):
span_tag = xml.dom.minidom.Element("span")
span_tag.setAttribute("class", "ocr_word")
span_tag.setAttribute("title", ("bbox %d %d %d %d" % (
(self.position[0][0], self.position[0][1],
self.position[1][0], self.position[1][1]))))
txt = xml.dom.minidom.Text()
txt.data = self.content
span_tag.appendChild(txt)
return span_tag
def __str__(self):
return self.get_unicode_string().encode('ascii', 'replace')
def __box_cmp(self, other):
"""
Comparison function.
"""
if other == None:
return -1
for cmp_result in (cmp(self.position[0][1], other.position[0][1]),
cmp(self.position[1][1], other.position[1][1]),
cmp(self.position[0][0], other.position[0][0]),
cmp(self.position[1][0], other.position[1][0])):
if cmp_result != 0:
return cmp_result
return 0
def __lt__(self, other):
return self.__box_cmp(other) < 0
def __gt__(self, other):
return self.__box_cmp(other) > 0
def __eq__(self, other):
return self.__box_cmp(other) == 0
def __le__(self, other):
return self.__box_cmp(other) <= 0
def __ge__(self, other):
return self.__box_cmp(other) >= 0
def __ne__(self, other):
return self.__box_cmp(other) != 0
def __hash__(self):
position_hash = 0
position_hash += ((self.position[0][0] & 0xFF) << 0)
position_hash += ((self.position[0][1] & 0xFF) << 8)
position_hash += ((self.position[1][0] & 0xFF) << 16)
position_hash += ((self.position[1][1] & 0xFF) << 24)
return (position_hash ^ hash(self.content) ^ hash(self.content))
class CharBoxBuilder(object):
"""
If passed to image_to_string(), image_to_string() will return an array of
......@@ -199,7 +79,7 @@ class CharBoxBuilder(object):
continue
position = ((int(elements[1]), int(elements[2])),
(int(elements[3]), int(elements[4])))
box = Box(elements[0], position)
box = builders.Box(elements[0], position)
boxes.append(box)
return boxes
......@@ -215,74 +95,21 @@ class CharBoxBuilder(object):
for box in boxes:
file_descriptor.write(box.get_unicode_string() + " 0\n")
class WordBoxBuilder(object):
"""
If passed to image_to_string(), image_to_string() will return an array of
Box. Each box correspond to a word recognized in the image.
"""
file_extension = "html"
tesseract_configs = ['hocr']
def __init__(self):
pass
@staticmethod
def __parse_position(xml_tag):
title = xml_tag.getAttribute("title")
title = title.split("; ")
title = title[-1]
title = title.split(" ")
position = ((int(title[1]), int(title[2])),
(int(title[3]), int(title[4])))
return position
@staticmethod
def __extract_txt(xml_tag):
txt = u""
for tag in xml_tag.childNodes:
if tag.nodeType == tag.TEXT_NODE:
txt += tag.wholeText
else:
txt += WordBoxBuilder.__extract_txt(tag)
return txt
element = elements[0]
def __str__():
return "Character boxes"
@staticmethod
def read_file(file_descriptor):
"""
Extract of set of Box from the lines of 'file_descriptor'
Return:
An array of Box.
"""
xml_string = file_descriptor.read().encode("utf-8")
xml_doc = xml.dom.minidom.parseString(xml_string)
boxes = []
for tag in xml_doc.getElementsByTagName("span"):
if ("ocr_word" != tag.getAttribute("class")):
continue
txt = WordBoxBuilder.__extract_txt(tag)
position = WordBoxBuilder.__parse_position(tag)
box = Box(txt, position)
boxes.append(box)
return boxes
def get_name():
    """
    Return the name of this OCR tool.
    """
    return "Tesseract"
@staticmethod
def write_file(file_descriptor, boxes):
"""
Write boxes in a box file. Output is a *very* *simplified* version
of hOCR.
Warning:
The file_descriptor must support UTF-8 ! (see module 'codecs')
"""
file_descriptor.write(u"<body>\n")
for box in boxes:
file_descriptor.write(box.get_xml_tag().toxml() + u"\n")
file_descriptor.write(u"</body>\n")
def get_available_builders():
    """
    Return the list of builder classes that can be used with this tool
    (see image_to_string()).
    """
    return [builders.TextBuilder, builders.WordBoxBuilder, CharBoxBuilder]
def run_tesseract(input_filename, output_filename_base, lang=None,
......@@ -378,7 +205,7 @@ def image_to_string(image, lang=None, builder=None):
'''
if builder == None:
builder = TextBuilder()
builder = builders.TextBuilder()
input_file_name = '%s.bmp' % tempnam()
output_file_name_base = tempnam()
......
......@@ -7,6 +7,7 @@ import tempfile
import unittest
import builders
import tesseract
......@@ -141,7 +142,7 @@ class TestWordBox(unittest.TestCase):
These tests make sure that Tesseract box handling works fine.
"""
def setUp(self):
self.builder = tesseract.WordBoxBuilder()
self.builder = builders.WordBoxBuilder()
def __test_txt(self, image_file, expected_box_file, lang='eng'):
image_file = "tests/" + image_file
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment