Commit e48a56a7 authored by Jerome Flesch

Tesseract C-API: Align Tesseract configuration between Tesseract C-API and Tesseract SH

Signed-off-by: Jerome Flesch <jflesch@gmail.com>
parent ff93d999
......@@ -71,7 +71,7 @@ class Box(object):
(self.position[0][0], self.position[0][1],
self.position[1][0], self.position[1][1]))))
txt = xml.dom.minidom.Text()
txt.data = self.content.encode('utf-8')
txt.data = self.content
span_tag.appendChild(txt)
return span_tag
......@@ -241,7 +241,7 @@ class TextBuilder(object):
def __init__(self, tesseract_layout=3, cuneiform_dotmatrix=False,
cuneiform_fax=False, cuneiform_singlecolumn=False):
self.tesseract_configs = ["-psm", str(tesseract_layout)]
self.tesseract_configs += ["-psm", str(tesseract_layout)]
self.tesseract_layout = tesseract_layout
# Add custom cuneiform parameters if needed
if cuneiform_dotmatrix:
......@@ -270,10 +270,12 @@ class TextBuilder(object):
self.built_text.append(u"")
def add_word(self, word, box):
self.built_text[-1] += u" " + word
if self.built_text[-1] != u"":
self.built_text[-1] += u" "
self.built_text[-1] += word
def end_line(self):
self.built_text[-1] = self.built_text[-1].strip()
pass
def get_output(self):
return u"\n".join(self.built_text)
......@@ -459,9 +461,10 @@ class WordBoxBuilder(object):
tesseract_configs = ['hocr']
cuneiform_args = ["-f", "hocr"]
def __init__(self):
def __init__(self, tesseract_layout=1):
self.word_boxes = []
self.tesseract_layout = 3
self.tesseract_layout = tesseract_layout
self.tesseract_configs += ["-psm", str(tesseract_layout)]
def read_file(self, file_descriptor):
"""
......@@ -527,10 +530,11 @@ class LineBoxBuilder(object):
tesseract_configs = ['hocr']
cuneiform_args = ["-f", "hocr"]
def __init__(self):
def __init__(self, tesseract_layout=1):
self.current_line = None
self.lines = []
self.tesseract_layout = 3
self.tesseract_layout = tesseract_layout
self.tesseract_configs += ["-psm", str(tesseract_layout)]
def read_file(self, file_descriptor):
"""
......
......@@ -91,10 +91,9 @@ def image_to_string(image, lang=None, builder=None):
lvl_word = tesseract_raw.PageIteratorLevel.WORD
try:
if builder.tesseract_layout != tesseract_raw.PageSegMode.AUTO:
tesseract_raw.set_page_seg_mode(
handle, builder.tesseract_layout
)
tesseract_raw.set_page_seg_mode(
handle, builder.tesseract_layout
)
tesseract_raw.set_image(handle, image)
......
......@@ -112,6 +112,16 @@ if g_libtesseract:
]
g_libtesseract.TessBaseAPIDelete.argtypes = None
# ctypes prototype for TessBaseAPIInit1: the init variant that also takes an
# OCR engine mode and an array of config names.
g_libtesseract.TessBaseAPIInit1.argtypes = [
    ctypes.c_void_p,  # TessBaseAPI*
    ctypes.c_char_p,  # datapath
    ctypes.c_char_p,  # language
    ctypes.c_int,  # TessOcrEngineMode
    ctypes.POINTER(ctypes.c_char_p),  # configs
    ctypes.c_int,  # configs_size
]
# The ctypes attribute is `restype` (singular). Assigning `restypes` only
# creates an unrelated attribute and the declaration is silently ignored.
g_libtesseract.TessBaseAPIInit1.restype = ctypes.c_int
g_libtesseract.TessBaseAPIInit3.argtypes = [
ctypes.c_void_p, # TessBaseAPI*
ctypes.c_char_p, # datapath
......@@ -119,6 +129,13 @@ if g_libtesseract:
]
g_libtesseract.TessBaseAPIInit3.restype = ctypes.c_int
g_libtesseract.TessBaseAPISetVariable.argtypes = [
ctypes.c_void_p, # TessBaseAPI*
ctypes.c_char_p, # name
ctypes.c_char_p, # value
]
g_libtesseract.TessBaseAPISetVariable.restype = ctypes.c_bool
g_libtesseract.TessBaseAPIGetAvailableLanguagesAsVector.argtypes = [
ctypes.c_void_p # TessBaseAPI*
]
......@@ -257,6 +274,11 @@ def init(lang=None):
ctypes.c_char_p(prefix),
ctypes.c_char_p(lang)
)
g_libtesseract.TessBaseAPISetVariable(
handle,
b"tessedit_zero_rejection",
b"F"
)
except:
g_libtesseract.TessBaseAPIDelete(handle)
raise
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment