Commit 3d19ff34 authored by Jerome Flesch

Tesseract C-API: Implements image_to_string()

Signed-off-by: Jerome Flesch <jflesch@gmail.com>
parent 771b1494
......@@ -241,7 +241,6 @@ class TextBuilder(object):
def __init__(self, tesseract_layout=3, cuneiform_dotmatrix=False,
cuneiform_fax=False, cuneiform_singlecolumn=False):
self.tesseract_configs = ["-psm", str(tesseract_layout)]
# Add custom cuneiform parameters if needed
if cuneiform_dotmatrix:
......@@ -250,7 +249,7 @@ class TextBuilder(object):
self.cuneiform_args.append("--fax")
if cuneiform_singlecolumn:
self.cuneiform_args.append("--singlecolumn")
pass
self.built_text = []
@staticmethod
def read_file(file_descriptor):
......@@ -266,6 +265,18 @@ class TextBuilder(object):
"""
file_descriptor.write(text)
def start_line(self, box):
    """
    Open a new, empty line of text.

    Arguments:
        box --- bounding box of the line. Not used by this builder.
    """
    self.built_text += [u""]
def add_word(self, box, word):
    """
    Append a word to the line currently being built.

    The word is always prefixed with a single space; the extra leading
    space of the first word is removed later by end_line().

    Arguments:
        box --- bounding box of the word. Not used by this builder.
        word --- text of the word
    """
    current = self.built_text[-1]
    self.built_text[-1] = current + u" " + word
def end_line(self):
    """
    Close the current line: strip the surrounding whitespace left by
    add_word() from the last stored line.
    """
    finished = self.built_text[-1]
    self.built_text[-1] = finished.strip()
def get_output(self):
    """
    Returns:
        All the lines built so far, joined with newlines, as a single
        unicode string.
    """
    line_separator = u"\n"
    return line_separator.join(self.built_text)
@staticmethod
def __str__():
return "Raw text"
......@@ -448,7 +459,7 @@ class WordBoxBuilder(object):
cuneiform_args = ["-f", "hocr"]
def __init__(self):
    """
    Create a builder with an empty list of word boxes.
    """
    self.word_boxes = []
def read_file(self, file_descriptor):
"""
......@@ -487,6 +498,18 @@ class WordBoxBuilder(object):
file_descriptor.write(xml_str + to_unicode("<br/>\n"))
file_descriptor.write(to_unicode("</body>\n"))
def start_line(self, box):
    """
    Start of a line. This builder only collects words, so nothing is
    done here.

    Arguments:
        box --- bounding box of the line. Ignored.
    """
def add_word(self, box, word):
    """
    Record a recognized word.

    Arguments:
        box --- bounding box of the word
        word --- text of the word
    """
    word_box = Box(word, box)
    self.word_boxes.append(word_box)
def end_line(self):
    """
    End of a line. This builder only collects words, so nothing is
    done here.
    """
def get_output(self):
    """
    Returns:
        The list of boxes filled in by add_word() (the list itself,
        not a copy).
    """
    collected = self.word_boxes
    return collected
@staticmethod
def __str__():
return "Word boxes"
......@@ -503,7 +526,8 @@ class LineBoxBuilder(object):
cuneiform_args = ["-f", "hocr"]
def __init__(self):
    """
    Create a builder with no line in progress and no finished line.
    """
    self.lines = []
    self.current_line = None
def read_file(self, file_descriptor):
"""
......@@ -548,6 +572,19 @@ class LineBoxBuilder(object):
file_descriptor.write(xml_str + to_unicode("<br/>\n"))
file_descriptor.write(to_unicode("</body>\n"))
def start_line(self, box):
    """
    Open a new line box and make it the current one.

    Arguments:
        box --- bounding box of the line
    """
    new_line = LineBox([], box)
    self.current_line = new_line
    self.lines.append(new_line)
def add_word(self, box, word):
    """
    Add a word to the line opened by the last call to start_line().

    Arguments:
        box --- bounding box of the word
        word --- text of the word
    """
    word_box = Box(word, box)
    line_words = self.current_line.word_boxes
    line_words.append(word_box)
def end_line(self):
    """
    End of a line. Nothing to do: the line was already stored by
    start_line().
    """
def get_output(self):
    """
    Returns:
        The list of line boxes created by start_line() (the list
        itself, not a copy).
    """
    collected = self.lines
    return collected
@staticmethod
def __str__():
return "Line boxes"
......@@ -47,6 +47,10 @@ def detect_orientation(image, lang=None):
)
tesseract_raw.set_image(handle, image)
os = tesseract_raw.detect_os(handle)
if os['confidence'] <= 0:
raise tesseract_raw.TesseractError(
"no script", "no script detected"
)
orientation = {
tesseract_raw.Orientation.PAGE_UP: 0,
tesseract_raw.Orientation.PAGE_RIGHT: 90,
......@@ -75,8 +79,46 @@ def get_available_builders():
def image_to_string(image, lang=None, builder=None):
    """
    Run Tesseract (C-API) on an image and feed the result to a builder.

    Arguments:
        image --- image to recognize. Passed as is to
            tesseract_raw.set_image().
        lang --- language. Passed as is to tesseract_raw.init().
        builder --- object providing start_line(box), add_word(box, word),
            end_line() and get_output(). Defaults to
            builders.TextBuilder().

    Returns:
        Whatever builder.get_output() returns.

    Raises:
        tesseract_raw.TesseractError --- if Tesseract provides no result
            iterator.
    """
    if builder is None:
        builder = builders.TextBuilder()
    handle = tesseract_raw.init(lang=lang)

    lvl_line = tesseract_raw.PageIteratorLevel.TEXTLINE
    lvl_word = tesseract_raw.PageIteratorLevel.WORD

    try:
        tesseract_raw.set_image(handle, image)
        # XXX(JFlesch): PageIterator and ResultIterator are actually the
        # very same thing. If it changes, we are screwed.
        tesseract_raw.recognize(handle)
        res_iterator = tesseract_raw.get_iterator(handle)
        if res_iterator is None:
            raise tesseract_raw.TesseractError(
                "no script", "no script detected"
            )
        try:
            page_iterator = tesseract_raw.result_iterator_get_page_iterator(
                res_iterator
            )

            # The iterator is handed out already positioned on the first
            # element, so each element must be read *before* advancing
            # (do/while). We walk word by word and use the iterator itself
            # to detect line boundaries; a nested "next line"/"next word"
            # loop would skip the first line and run past line ends.
            line_open = False
            while True:
                if tesseract_raw.page_iterator_is_at_beginning_of(
                        page_iterator, lvl_line):
                    if line_open:
                        builder.end_line()
                        line_open = False
                    (found, box) = tesseract_raw.page_iterator_bounding_box(
                        page_iterator, lvl_line
                    )
                    # No box means there is nothing at this position
                    # (e.g. blank page): do not open a line.
                    if found:
                        builder.start_line(box)
                        line_open = True

                last_word_in_line = (
                    tesseract_raw.page_iterator_is_at_final_element(
                        page_iterator, lvl_line, lvl_word
                    )
                )

                if line_open:
                    word = tesseract_raw.result_iterator_get_utf8_text(
                        res_iterator, lvl_word
                    )
                    if word:
                        (found, box) = (
                            tesseract_raw.page_iterator_bounding_box(
                                page_iterator, lvl_word
                            )
                        )
                        if found:
                            builder.add_word(box, word)
                    if last_word_in_line:
                        builder.end_line()
                        line_open = False

                if not tesseract_raw.page_iterator_next(
                        page_iterator, lvl_word):
                    break

            if line_open:
                builder.end_line()
        finally:
            # The iterator returned by get_iterator() belongs to the caller.
            # It is the same object as the page iterator (see XXX above),
            # so the page iterator destructor releases it.
            tesseract_raw.page_iterator_delete(res_iterator)
    finally:
        tesseract_raw.cleanup(handle)

    return builder.get_output()
def is_available():
......
......@@ -62,6 +62,31 @@ class Orientation(object):
PAGE_LEFT = 3
class PageIteratorLevel(object):
    """
    Integer levels passed as the TessPageIteratorLevel argument of the
    page / result iterator functions.
    """
    (
        BLOCK,
        PARA,
        TEXTLINE,
        WORD,
        SYMBOL,
    ) = range(5)
class PolyBlockType(object):
    """
    Integer block type codes (PolyBlockType), the declared return type of
    TessPageIteratorBlockType.
    """
    (
        UNKNOWN,
        FLOWING_TEXT,
        HEADING_TEXT,
        PULLOUT_TEXT,
        TABLE,
        VERTICAL_TEXT,
        CAPTION_TEXT,
        FLOWING_IMAGE,
        HEADING_IMAGE,
        PULLOUT_IMAGE,
        HORZ_LINE,
        VERT_LINE,
        NOISE,
        COUNT,
    ) = range(14)
class OSResults(ctypes.Structure):
_fields_ = [
("orientations", ctypes.c_float * 4),
......@@ -121,29 +146,35 @@ if g_libtesseract:
]
g_libtesseract.TessBaseAPISetImage.restype = None
g_libtesseract.TessBaseAPIRecognize.argstypes = [
g_libtesseract.TessBaseAPIRecognize.argtypes = [
ctypes.c_void_p, # TessBaseAPI*
ctypes.c_void_p, # ETEXT_DESC*
]
g_libtesseract.TessBaseAPIRecognize.restype = ctypes.c_int
g_libtesseract.TessBaseAPIAnalyseLayout.argstypes = [
g_libtesseract.TessBaseAPIGetIterator.argtypes = [
ctypes.c_void_p, # TessBaseAPI*
]
g_libtesseract.TessBaseAPIGetIterator.restype = \
ctypes.c_void_p # TessResultIterator
g_libtesseract.TessBaseAPIAnalyseLayout.argtypes = [
ctypes.c_void_p, # TessBaseAPI*
]
g_libtesseract.TessBaseAPIAnalyseLayout.restype = \
ctypes.c_void_p # TessPageIterator*
g_libtesseract.TessBaseAPIGetUTF8Text.argstype = [
g_libtesseract.TessBaseAPIGetUTF8Text.argtypes = [
ctypes.c_void_p, # TessBaseAPI*
]
g_libtesseract.TessBaseAPIGetUTF8Text.restype = ctypes.c_char_p
g_libtesseract.TessPageIteratorDelete.argstypes = [
g_libtesseract.TessPageIteratorDelete.argtypes = [
ctypes.c_void_p, # TessPageIterator*
]
g_libtesseract.TessPageIteratorDelete.restype = None
g_libtesseract.TessPageIteratorOrientation.argstype = [
g_libtesseract.TessPageIteratorOrientation.argtypes = [
ctypes.c_void_p, # TessPageIterator*
ctypes.POINTER(ctypes.c_int), # TessOrientation*
ctypes.POINTER(ctypes.c_int), # TessWritingDirection*
......@@ -152,7 +183,60 @@ if g_libtesseract:
]
g_libtesseract.TessPageIteratorOrientation.restype = None
g_libtesseract.TessBaseAPIDetectOS.argstype = [
# ctypes prototypes for the page iterator functions. Setting argtypes /
# restype tells ctypes how to convert the arguments and the return value.

# TessPageIteratorNext(TessPageIterator*, TessPageIteratorLevel) -> bool
g_libtesseract.TessPageIteratorNext.argtypes = [
ctypes.c_void_p, # TessPageIterator*
ctypes.c_int, # TessPageIteratorLevel
]
g_libtesseract.TessPageIteratorNext.restype = ctypes.c_bool
# TessPageIteratorIsAtBeginningOf(TessPageIterator*, TessPageIteratorLevel)
# -> bool
g_libtesseract.TessPageIteratorIsAtBeginningOf.argtypes = [
ctypes.c_void_p, # TessPageIterator*
ctypes.c_int, # TessPageIteratorLevel
]
g_libtesseract.TessPageIteratorIsAtBeginningOf.restype = ctypes.c_bool
# TessPageIteratorIsAtFinalElement(TessPageIterator*, level, element) -> bool
# Both 'level' and 'element' are TessPageIteratorLevel values.
g_libtesseract.TessPageIteratorIsAtFinalElement.argtypes = [
ctypes.c_void_p, # TessPageIterator*
ctypes.c_int, # TessPageIteratorLevel (level)
ctypes.c_int, # TessPageIteratorLevel (element)
]
g_libtesseract.TessPageIteratorIsAtFinalElement.restype = ctypes.c_bool
# TessPageIteratorBlockType(TessPageIterator*) -> int (see PolyBlockType)
g_libtesseract.TessPageIteratorBlockType.argtypes = [
ctypes.c_void_p, # TessPageIterator*
]
g_libtesseract.TessPageIteratorBlockType.restype = \
ctypes.c_int # PolyBlockType
# TessPageIteratorBoundingBox(TessPageIterator*, level, &left, &top, &right,
# &bottom) -> bool
# The attribute must be 'argtypes': any other name (e.g. 'args') is silently
# stored on the function object and ignored by ctypes, in which case the
# iterator pointer would be passed as a plain C int.
g_libtesseract.TessPageIteratorBoundingBox.argtypes = [
    ctypes.c_void_p,  # TessPageIterator*
    ctypes.c_int,  # TessPageIteratorLevel (level)
    ctypes.POINTER(ctypes.c_int),  # left
    ctypes.POINTER(ctypes.c_int),  # top
    ctypes.POINTER(ctypes.c_int),  # right
    ctypes.POINTER(ctypes.c_int),  # bottom
]
g_libtesseract.TessPageIteratorBoundingBox.restype = ctypes.c_bool
# ctypes prototypes for the result iterator functions and for the text
# deallocator.

# TessResultIteratorGetPageIterator(TessResultIterator*) -> TessPageIterator*
g_libtesseract.TessResultIteratorGetPageIterator.argtypes = [
ctypes.c_void_p, # TessResultIterator*
]
g_libtesseract.TessResultIteratorGetPageIterator.restype = \
ctypes.c_void_p # TessPageIterator*
# TessResultIteratorGetUTF8Text(TessResultIterator*, level) -> char*
# NOTE(review): with restype c_char_p, ctypes converts the result to a Python
# bytes object (or None) and the C pointer is no longer available, so the
# returned buffer cannot be handed back to TessDeleteText. Confirm whether
# c_void_p (plus an explicit cast) is what is wanted here.
g_libtesseract.TessResultIteratorGetUTF8Text.argtypes = [
ctypes.c_void_p, # TessResultIterator*
ctypes.c_int, # TessPageIteratorLevel (level)
]
g_libtesseract.TessResultIteratorGetUTF8Text.restype = \
ctypes.c_char_p # char*
# TessDeleteText(char*) -> void
g_libtesseract.TessDeleteText.argtypes = [
ctypes.c_char_p
]
g_libtesseract.TessDeleteText.restype = None
g_libtesseract.TessBaseAPIDetectOS.argtypes = [
ctypes.c_void_p, # TessBaseAPI*
ctypes.POINTER(OSResults),
]
......@@ -262,7 +346,10 @@ def analyse_layout(handle):
def get_utf8_text(handle):
    """
    Get the whole recognized text.

    Arguments:
        handle --- TessBaseAPI* handle

    Returns:
        The text decoded from UTF-8, or None if the library returned no
        text.
    """
    assert g_libtesseract
    raw = g_libtesseract.TessBaseAPIGetUTF8Text(handle)
    if raw is None:
        return None
    if isinstance(raw, bytes):
        # restype is c_char_p: ctypes already copied the C string into a
        # Python bytes object. There is no '.value' to read and the C
        # pointer is gone, so the buffer cannot be released from here.
        return raw.decode("utf-8")
    # restype is a raw pointer (c_void_p): read the string, then give the
    # buffer back to the library with the declared TessDeleteText().
    text_ptr = ctypes.cast(raw, ctypes.c_char_p)
    try:
        return text_ptr.value.decode("utf-8")
    finally:
        g_libtesseract.TessDeleteText(text_ptr)
def page_iterator_delete(iterator):
......@@ -272,6 +359,60 @@ def page_iterator_delete(iterator):
return g_libtesseract.TessPageIteratorDelete(iterator)
def page_iterator_next(iterator, level):
    """
    Call TessPageIteratorNext() on the iterator.

    Arguments:
        iterator --- TessPageIterator*
        level --- PageIteratorLevel value

    Returns:
        The boolean returned by the library. image_to_string() stops
        iterating when it is false.
    """
    assert g_libtesseract
    moved = g_libtesseract.TessPageIteratorNext(iterator, level)
    return moved
def page_iterator_is_at_beginning_of(iterator, level):
    """
    Call TessPageIteratorIsAtBeginningOf() on the iterator.

    Arguments:
        iterator --- TessPageIterator*
        level --- PageIteratorLevel value

    Returns:
        The boolean returned by the library.
    """
    assert g_libtesseract
    at_beginning = g_libtesseract.TessPageIteratorIsAtBeginningOf(
        iterator, level
    )
    return at_beginning
def page_iterator_is_at_final_element(iterator, level, element):
    """
    Call TessPageIteratorIsAtFinalElement() on the iterator.

    Arguments:
        iterator --- TessPageIterator*
        level --- PageIteratorLevel value (level)
        element --- PageIteratorLevel value (element)

    Returns:
        The boolean returned by the library.
    """
    assert g_libtesseract
    is_final = g_libtesseract.TessPageIteratorIsAtFinalElement(
        iterator, level, element
    )
    return is_final
def page_iterator_block_type(iterator):
    """
    Call TessPageIteratorBlockType() on the iterator.

    Arguments:
        iterator --- TessPageIterator*

    Returns:
        The integer returned by the library (see PolyBlockType).
    """
    assert g_libtesseract
    block_type = g_libtesseract.TessPageIteratorBlockType(iterator)
    return block_type
def page_iterator_bounding_box(iterator, level):
    """
    Get the bounding box of the element the iterator is positioned on.

    Arguments:
        iterator --- TessPageIterator*
        level --- PageIteratorLevel value

    Returns:
        (True, (left, top, right, bottom)) if the library reported a box,
        (False, (0, 0, 0, 0)) otherwise.
    """
    assert g_libtesseract
    left = ctypes.c_int(0)
    top = ctypes.c_int(0)
    right = ctypes.c_int(0)
    bottom = ctypes.c_int(0)
    found = g_libtesseract.TessPageIteratorBoundingBox(
        # Wrap the pointer explicitly: if the function prototype is not
        # applied, ctypes passes a Python int as a C int, which truncates
        # pointers on 64-bit platforms.
        ctypes.c_void_p(iterator),
        level,
        ctypes.byref(left),
        ctypes.byref(top),
        ctypes.byref(right),
        ctypes.byref(bottom)
    )
    if not found:
        return (False, (0, 0, 0, 0))
    return (True, (left.value, top.value, right.value, bottom.value))
def page_iterator_orientation(iterator):
global g_libtesseract
assert(g_libtesseract)
......@@ -297,6 +438,27 @@ def page_iterator_orientation(iterator):
}
def get_iterator(handle):
    """
    Call TessBaseAPIGetIterator() on the handle.

    Arguments:
        handle --- TessBaseAPI* handle

    Returns:
        The TessResultIterator pointer returned by the library.
        image_to_string() treats None as "nothing was recognized".
    """
    assert g_libtesseract
    res_iterator = g_libtesseract.TessBaseAPIGetIterator(handle)
    return res_iterator
def result_iterator_get_page_iterator(res_iterator):
    """
    Call TessResultIteratorGetPageIterator() on the result iterator.

    Arguments:
        res_iterator --- TessResultIterator*

    Returns:
        The TessPageIterator pointer returned by the library.
    """
    assert g_libtesseract
    page_iterator = g_libtesseract.TessResultIteratorGetPageIterator(
        res_iterator
    )
    return page_iterator
def result_iterator_get_utf8_text(iterator, level):
    """
    Get the text of the element the result iterator is positioned on.

    Arguments:
        iterator --- TessResultIterator*
        level --- PageIteratorLevel value

    Returns:
        The text decoded from UTF-8, or None if the library returned no
        text.
    """
    assert g_libtesseract
    # The exported symbol is TessResultIteratorGetUTF8Text (the one the
    # prototype is declared for), not TessBaseAPIResultIteratorGetUTF8Text.
    raw = g_libtesseract.TessResultIteratorGetUTF8Text(iterator, level)
    if raw is None:
        return None
    if isinstance(raw, bytes):
        # restype is c_char_p: ctypes already copied the C string into a
        # Python bytes object. There is no '.value' to read and the C
        # pointer is gone, so the buffer cannot be released from here.
        return raw.decode("utf-8")
    # restype is a raw pointer (c_void_p): read the string, then give the
    # buffer back to the library with the declared TessDeleteText().
    text_ptr = ctypes.cast(raw, ctypes.c_char_p)
    try:
        return text_ptr.value.decode("utf-8")
    finally:
        g_libtesseract.TessDeleteText(text_ptr)
def detect_os(handle):
global g_libtesseract
assert(g_libtesseract)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment