Commit e58948ca authored by Jerome Flesch

Tesseract C-API: ignore empty lines

Signed-off-by: Jerome Flesch <jflesch@gmail.com>
parent e48a56a7
......@@ -531,7 +531,6 @@ class LineBoxBuilder(object):
cuneiform_args = ["-f", "hocr"]
def __init__(self, tesseract_layout=1):
self.current_line = None
self.lines = []
self.tesseract_layout = tesseract_layout
self.tesseract_configs += ["-psm", str(tesseract_layout)]
......@@ -580,11 +579,10 @@ class LineBoxBuilder(object):
file_descriptor.write(to_unicode("</body>\n"))
def start_line(self, box):
self.current_line = LineBox([], box)
self.lines.append(self.current_line)
self.lines.append(LineBox([], box))
def add_word(self, word, box):
self.current_line.word_boxes.append(Box(word, box))
self.lines[-1].word_boxes.append(Box(word, box))
def end_line(self):
pass
......
......@@ -109,28 +109,34 @@ def image_to_string(image, lang=None, builder=None):
res_iterator
)
was_empty = False
while True:
if tesseract_raw.page_iterator_is_at_beginning_of(
page_iterator, lvl_line):
page_iterator, lvl_line) and not was_empty:
(r, box) = tesseract_raw.page_iterator_bounding_box(
page_iterator, lvl_line
)
assert(r)
box = _tess_box_to_pyocr_box(box)
builder.start_line(box)
was_empty = True
word = tesseract_raw.result_iterator_get_utf8_text(
res_iterator, lvl_word
)
(r, box) = tesseract_raw.page_iterator_bounding_box(
page_iterator, lvl_word
)
assert(r)
box = _tess_box_to_pyocr_box(box)
builder.add_word(word, box)
if word.strip() != "":
(r, box) = tesseract_raw.page_iterator_bounding_box(
page_iterator, lvl_word
)
assert(r)
box = _tess_box_to_pyocr_box(box)
builder.add_word(word, box)
was_empty = False
if tesseract_raw.page_iterator_is_at_final_element(
page_iterator, lvl_line, lvl_word):
page_iterator, lvl_line, lvl_word) and not was_empty:
builder.end_line()
if not tesseract_raw.page_iterator_next(page_iterator, lvl_word):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment