Commit dd05e84d authored by Jerome Flesch

Tesseract C-API: Start fixing the retrieval of the boxes and of the text in each box.

Remaining:
- box position
- lines are not split correctly (we always get a single line right now)
Signed-off-by: Jerome Flesch <jflesch@gmail.com>
parent 3d19ff34
......@@ -130,7 +130,7 @@ class LineBox(object):
def __init__(self, word_boxes, position):
"""
Arguments:
word_boxes --- a single string
word_boxes --- a list of Box objects
position --- the position of the box on the image. Given as a
tuple of tuple:
((width_pt_x, height_pt_x), (width_pt_y, height_pt_y))
......@@ -268,7 +268,7 @@ class TextBuilder(object):
def start_line(self, box):
self.built_text.append(u"")
def add_word(self, box, word):
def add_word(self, word, box):
self.built_text[-1] += u" " + word
def end_line(self):
......@@ -501,7 +501,7 @@ class WordBoxBuilder(object):
def start_line(self, box):
pass
def add_word(self, box, word):
def add_word(self, word, box):
self.word_boxes.append(Box(word, box))
def end_line(self):
......@@ -576,7 +576,7 @@ class LineBoxBuilder(object):
self.current_line = LineBox([], box)
self.lines.append(self.current_line)
def add_word(self, box, word):
def add_word(self, word, box):
self.current_line.word_boxes.append(Box(word, box))
def end_line(self):
......
......@@ -76,6 +76,13 @@ def get_available_builders():
]
def _tess_box_to_pyocr_box(box):
return (
(box[0], box[1]),
(box[2], box[3]),
)
def image_to_string(image, lang=None, builder=None):
if builder is None:
builder = builders.TextBuilder()
......@@ -99,13 +106,14 @@ def image_to_string(image, lang=None, builder=None):
res_iterator
)
while tesseract_raw.page_iterator_next(page_iterator, lvl_line):
while True:
(r, box) = tesseract_raw.page_iterator_bounding_box(
page_iterator, lvl_line
)
assert(r)
box = _tess_box_to_pyocr_box(box)
builder.start_line(box)
while tesseract_raw.page_iterator_next(page_iterator, lvl_word):
while True:
word = tesseract_raw.result_iterator_get_utf8_text(
res_iterator, lvl_word
)
......@@ -113,8 +121,13 @@ def image_to_string(image, lang=None, builder=None):
page_iterator, lvl_word
)
assert(r)
builder.add_word(box, word)
box = _tess_box_to_pyocr_box(box)
builder.add_word(word, box)
if not tesseract_raw.page_iterator_next(page_iterator, lvl_word):
break
builder.end_line()
if not tesseract_raw.page_iterator_next(page_iterator, lvl_line):
break
finally:
tesseract_raw.cleanup(handle)
......
......@@ -167,7 +167,7 @@ if g_libtesseract:
g_libtesseract.TessBaseAPIGetUTF8Text.argtypes = [
ctypes.c_void_p, # TessBaseAPI*
]
g_libtesseract.TessBaseAPIGetUTF8Text.restype = ctypes.c_char_p
g_libtesseract.TessBaseAPIGetUTF8Text.restype = ctypes.c_void_p
g_libtesseract.TessPageIteratorDelete.argtypes = [
ctypes.c_void_p, # TessPageIterator*
......@@ -229,10 +229,10 @@ if g_libtesseract:
ctypes.c_int, # TessPageIteratorLevel (level)
]
g_libtesseract.TessResultIteratorGetUTF8Text.restype = \
ctypes.c_char_p # TessPageIterator*
ctypes.c_void_p
g_libtesseract.TessDeleteText.argtypes = [
ctypes.c_char_p
ctypes.c_void_p
]
g_libtesseract.TessDeleteText.restype = None
......@@ -346,9 +346,9 @@ def analyse_layout(handle):
def get_utf8_text(handle):
txt = g_libtesseract.TessBaseAPIGetUTF8Text(handle)
val = txt.value.decode("utf-8")
g_libtesseract.TessBaseAPIDeleteText(val)
ptr = g_libtesseract.TessBaseAPIGetUTF8Text(handle)
val = ctypes.cast(ptr, ctypes.c_char_p).value.decode("utf-8")[:]
g_libtesseract.TessDeleteText(ptr)
return val
......@@ -453,9 +453,9 @@ def result_iterator_get_page_iterator(res_iterator):
def result_iterator_get_utf8_text(iterator, level):
txt = g_libtesseract.TessBaseAPIResultIteratorGetUTF8Text(iterator, level)
val = txt.value.decode("utf-8")
g_libtesseract.TessBaseAPIDeleteText(val)
ptr = g_libtesseract.TessResultIteratorGetUTF8Text(iterator, level)
val = ctypes.cast(ptr, ctypes.c_char_p).value.decode("utf-8")[:]
g_libtesseract.TessDeleteText(ptr)
return val
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment