Commit a6e06b4c authored by David Martin

Merge branch 'master' of github.com:openpaperwork/pyocr into update_deprecated_psm_option_string

parents cc3c0b9a 410d37d9
......@@ -280,7 +280,8 @@ Beware this code hasn't been adapted to libtesseract 3 yet.
## Tests
```sh
python ./run_tests.py
make check # requires pyflake8
make test # requires tox
```
Tests are made to be run with the latest versions of Tesseract and Cuneiform.
......
......@@ -483,6 +483,9 @@ class _LineHTMLParser(HTMLParser):
line, the position of all its characters.
Spaces have "-1 -1 -1 -1" for position.
"""
TAG_TYPE_CONTENT = 0
TAG_TYPE_POSITIONS = 1
def __init__(self):
HTMLParser.__init__(self)
self.boxes = []
......@@ -490,24 +493,21 @@ class _LineHTMLParser(HTMLParser):
self.__char_positions = None
def handle_starttag(self, tag, attrs):
TAG_TYPE_CONTENT = 0
TAG_TYPE_POSITIONS = 1
if (tag != "span"):
return
tag_type = -1
for attr in attrs:
if attr[0] == 'class':
if attr[1] == 'ocr_line':
tag_type = TAG_TYPE_CONTENT
tag_type = self.TAG_TYPE_CONTENT
elif attr[1] == 'ocr_cinfo':
tag_type = TAG_TYPE_POSITIONS
tag_type = self.TAG_TYPE_POSITIONS
if tag_type == TAG_TYPE_CONTENT:
if tag_type == self.TAG_TYPE_CONTENT:
self.__line_text = to_unicode("")
self.__char_positions = []
return
elif tag_type == TAG_TYPE_POSITIONS:
elif tag_type == self.TAG_TYPE_POSITIONS:
for attr in attrs:
if attr[0] == 'title':
self.__char_positions = attr[1].split(" ")
......
......@@ -359,7 +359,7 @@ def init(lang=None):
b"tessedit_zero_rejection",
b"F"
)
except:
except: # noqa: E722
g_libtesseract.TessBaseAPIDelete(ctypes.c_void_p(handle))
raise
return handle
......
......@@ -17,7 +17,7 @@ def digits_only(string):
def to_unicode(string):
try:
return six.u(string)
except:
except: # noqa: E722
# probably already decoded
return string
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment