Commit b95bdbcc authored by David Martin

Determine the psm parameter based on the Tesseract version.

It turns out that for versions before the current 4 beta only '-psm' is
allowed, and the latest build only allows '--psm'.
parent c136838b
......@@ -14,6 +14,7 @@ except ImportError:
import xml.dom.minidom
import logging
from .tesseract import psm_parameter
from .util import to_unicode
logger = logging.getLogger(__name__)
......@@ -305,7 +306,7 @@ class TextBuilder(BaseBuilder):
def __init__(self, tesseract_layout=3, cuneiform_dotmatrix=False,
cuneiform_fax=False, cuneiform_singlecolumn=False):
file_ext = ["txt"]
tess_flags = ["--psm", str(tesseract_layout)]
tess_flags = [psm_parameter(), str(tesseract_layout)]
cun_args = ["-f", "text"]
# Add custom cuneiform parameters if needed
for par, arg in [(cuneiform_dotmatrix, "--dotmatrix"),
......@@ -562,7 +563,7 @@ class WordBoxBuilder(BaseBuilder):
def __init__(self, tesseract_layout=1):
file_ext = ["html", "hocr"]
tess_flags = ["--psm", str(tesseract_layout)]
tess_flags = [psm_parameter(), str(tesseract_layout)]
tess_conf = ["hocr"]
cun_args = ["-f", "hocr"]
super(WordBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
......@@ -638,7 +639,7 @@ class LineBoxBuilder(BaseBuilder):
def __init__(self, tesseract_layout=1):
file_ext = ["html", "hocr"]
tess_flags = ["--psm", str(tesseract_layout)]
tess_flags = [psm_parameter(), str(tesseract_layout)]
tess_conf = ["hocr"]
cun_args = ["-f", "hocr"]
super(LineBoxBuilder, self).__init__(file_ext, tess_flags, tess_conf,
......
......@@ -161,6 +161,15 @@ def can_detect_orientation():
)
def psm_parameter():
    """Pick the page-segmentation-mode flag for the installed Tesseract.

    The first component of ``get_version()`` is compared against 3:
    when it is 3 or lower the single-dash spelling ``"-psm"`` is
    returned, otherwise the double-dash spelling ``"--psm"``.

    Returns:
        The option string to place before the layout number on the
        Tesseract command line.
    """
    major = get_version()[0]
    return "-psm" if major <= 3 else "--psm"
def detect_orientation(image, lang=None):
"""
Arguments:
......@@ -178,7 +187,7 @@ def detect_orientation(image, lang=None):
"""
_set_environment()
with temp_dir() as tmpdir:
command = [TESSERACT_CMD, "input.bmp", 'stdout', "--psm", "0"]
command = [TESSERACT_CMD, "input.bmp", 'stdout', psm_parameter(), "0"]
version = get_version()
if version[0] >= 4:
# XXX: temporary fix to remove once Tesseract 4 is stable
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment