Commit eae12f25 authored by Jerome Flesch

Tesseract C-API: Fix the way lines and words are obtained from libtesseract

Signed-off-by: Jerome Flesch <jflesch@gmail.com>
parent ecabf0f8
......@@ -112,27 +112,32 @@ def image_to_string(image, lang=None, builder=None):
)
while True:
(r, box) = tesseract_raw.page_iterator_bounding_box(
page_iterator, lvl_line
)
assert(r)
box = _tess_box_to_pyocr_box(box)
builder.start_line(box)
while True:
word = tesseract_raw.result_iterator_get_utf8_text(
res_iterator, lvl_word
)
if tesseract_raw.page_iterator_is_at_beginning_of(
page_iterator, lvl_line):
(r, box) = tesseract_raw.page_iterator_bounding_box(
page_iterator, lvl_word
page_iterator, lvl_line
)
assert(r)
box = _tess_box_to_pyocr_box(box)
builder.add_word(word, box)
if not tesseract_raw.page_iterator_next(page_iterator, lvl_word):
break
builder.end_line()
if not tesseract_raw.page_iterator_next(page_iterator, lvl_line):
builder.start_line(box)
word = tesseract_raw.result_iterator_get_utf8_text(
res_iterator, lvl_word
)
(r, box) = tesseract_raw.page_iterator_bounding_box(
page_iterator, lvl_word
)
assert(r)
box = _tess_box_to_pyocr_box(box)
builder.add_word(word, box)
if tesseract_raw.page_iterator_is_at_final_element(
page_iterator, lvl_line, lvl_word):
builder.end_line()
if not tesseract_raw.page_iterator_next(page_iterator, lvl_word):
break
finally:
tesseract_raw.cleanup(handle)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment