Commit 199e4c5c authored by Thomas Perret

Put back get_unicode_string for python2 support

parent e37af8fa
......@@ -60,6 +60,20 @@ class Box(object):
self.position = position
self.confidence = confidence
def get_unicode_string(self):
    """
    Serialize this box as a unicode (utf8) string.

    The result is the box content followed by the two coordinates of
    position[0] and the two coordinates of position[1], each formatted
    with %d and separated by single spaces. The string can be written
    to a file as-is (see write_box_file()) and read back using
    read_box_file().
    """
    first_corner = self.position[0]
    second_corner = self.position[1]
    template = to_unicode("%s %d %d %d %d")
    fields = (
        self.content,
        first_corner[0],
        first_corner[1],
        second_corner[0],
        second_corner[1],
    )
    return template % fields
def get_xml_tag(self, parent_doc):
span_tag = parent_doc.createElement("span")
span_tag.setAttribute("class", "ocrx_word")
......@@ -73,13 +87,7 @@ class Box(object):
return span_tag
def __str__(self):
return u"{} {} {} {} {}".format(
self.content,
self.position[0][0],
self.position[0][1],
self.position[1][0],
self.position[1][1],
)
return self.get_unicode_string()
def __box_cmp(self, other):
"""
......@@ -150,6 +158,23 @@ class LineBox(object):
txt = txt.strip()
return txt
def get_unicode_string(self):
    """
    Serialize this line box as a unicode (utf8) string.

    The output opens with "[", lists the unicode string of every word
    box on its own line, then closes with "]" followed by the line's
    own position coordinates formatted with %d. The string can be
    written to a file as-is (see write_box_file()) and read back using
    read_box_file().
    """
    word_lines = [
        to_unicode(" %s\n") % word.get_unicode_string()
        for word in self.word_boxes
    ]
    body = to_unicode("[\n") + "".join(word_lines)
    first_corner = self.position[0]
    second_corner = self.position[1]
    template = to_unicode("%s] %d %d %d %d")
    return template % (
        body,
        first_corner[0],
        first_corner[1],
        second_corner[0],
        second_corner[1],
    )
def get_xml_tag(self, parent_doc):
span_tag = parent_doc.createElement("span")
span_tag.setAttribute("class", "ocr_line")
......@@ -166,22 +191,7 @@ class LineBox(object):
return span_tag
def __str__(self):
txt = u"[\n"
for box in self.word_boxes:
txt += u" {} {} {} {} {}\n".format(
box.content,
box.position[0][0],
box.position[0][1],
box.position[1][0],
box.position[1][1],
)
return u"{}] {} {} {} {}".format(
txt,
self.position[0][0],
self.position[0][1],
self.position[1][0],
self.position[1][1],
)
return self.get_unicode_string()
def __box_cmp(self, other):
"""
......
......@@ -106,7 +106,7 @@ class CharBoxBuilder(builders.BaseBuilder):
The file_descriptor must support UTF-8 ! (see module 'codecs')
"""
for box in boxes:
file_descriptor.write(to_unicode(str(box)) + " 0\n")
file_descriptor.write(box.get_unicode_string() + " 0\n")
def __str__(self):
return "Character boxes"
......
import sys
import unittest
import xml.dom.minidom
......@@ -12,6 +13,7 @@ class TestBox(unittest.TestCase):
self.box1 = builders.Box("word1", ((15, 22), (23, 42)))
self.box1_bis = builders.Box("word1_bis", ((15, 22), (23, 42)))
self.box2 = builders.Box("word2", ((30, 5), (40, 15)), 95)
self.box_unicode = builders.Box(u"\xe9", ((1, 2), (3, 4)))
def test_init(self):
self.assertEqual(self.box1.content, "word1")
......@@ -29,9 +31,21 @@ class TestBox(unittest.TestCase):
"bbox 15 22 23 42; x_wconf 0")
self.assertEqual(tag.firstChild.data, "word1")
def test_get_unicode_string(self):
    # The box holding the non-ASCII character must serialize to the
    # matching unicode string: content followed by its coordinates.
    expected = u"\xe9 1 2 3 4"
    serialized = self.box_unicode.get_unicode_string()
    self.assertEqual(serialized, expected)
def test_str_method(self):
    # str() on an ASCII-only box yields its content and coordinates.
    rendered = str(self.box1)
    self.assertEqual(rendered, "word1 15 22 23 42")
@unittest.skipUnless(sys.version_info < (3, 0), "python2 box str")
def test_str_python2(self):
    # Runs only on Python 2: str() of the non-ASCII box is compared
    # against the UTF-8 encoding of the expected unicode text.
    expected_text = u"\xe9 1 2 3 4"
    self.assertEqual(str(self.box_unicode), expected_text.encode("utf-8"))
@unittest.skipIf(sys.version_info < (3, 0), "python3 box str")
def test_str_python3(self):
    # Skipped on Python 2: str() of the non-ASCII box is compared
    # directly against the expected text.
    expected = "\xe9 1 2 3 4"
    self.assertEqual(str(self.box_unicode), expected)
def test_box_not_equal_None(self):
self.assertNotEqual(self.box1, None)
......@@ -65,6 +79,7 @@ class TestLineBox(unittest.TestCase):
box2 = builders.Box("word2", ((25, 23), (30, 32)))
box3 = builders.Box("word3", ((32, 25), (40, 32)), 95)
box4 = builders.Box("word4", ((41, 18), (44, 33)), 98)
box_unicode = builders.Box(u"\xe9", ((1, 2), (3, 4)), 98)
self.line1 = builders.LineBox(
[box1, box2, box3, box4],
((14, 15), (45, 33))
......@@ -81,13 +96,16 @@ class TestLineBox(unittest.TestCase):
[box1, box2, box3, box4],
((14, 15), (45, 33))
)
self.line_unicode = builders.LineBox(
[box1, box_unicode],
((1, 2), (3, 4))
)
def test_init(self):
    # Check the word count, position and joined content of line1.
    line = self.line1
    self.assertEqual(len(line.word_boxes), 4)
    self.assertSequenceEqual(line.position, ((14, 15), (45, 33)))
    self.assertEqual(line.content, "word1 word2 word3 word4")
# @unittest.skip("TODO: fix get_xml_tag method in LineBox")
def test_get_xml_tag(self):
impl = xml.dom.minidom.getDOMImplementation()
doc = impl.createDocument(None, "root", None)
......@@ -99,13 +117,30 @@ class TestLineBox(unittest.TestCase):
self.assertEqual(tag.firstChild.firstChild.data, "word1")
self.assertEqual(tag.lastChild.firstChild.data, "word4")
def test_get_unicode_string(self):
    # The line mixing an ASCII word and a non-ASCII word must
    # serialize to the expected bracketed unicode string.
    expected = u"[\n word1 15 22 23 30\n \xe9 1 2 3 4\n] 1 2 3 4"
    serialized = self.line_unicode.get_unicode_string()
    self.assertEqual(serialized, expected)
def test_line_str(self):
output = str(self.line1)
expected = "[\n"
for box in self.line1.word_boxes:
expected += " " + box.__str__() + "\n"
expected += "] 14 15 45 33"
self.assertEqual(output, expected)
self.assertEqual(str(self.line1), expected)
@unittest.skipUnless(sys.version_info < (3, 0), "python2 line str")
def test_str_python2(self):
    # Runs only on Python 2: str() of the non-ASCII line is compared
    # against the UTF-8 encoding of the expected unicode text.
    expected_text = u"[\n word1 15 22 23 30\n \xe9 1 2 3 4\n] 1 2 3 4"
    self.assertEqual(str(self.line_unicode), expected_text.encode("utf-8"))
@unittest.skipIf(sys.version_info < (3, 0), "python3 line str")
def test_str_python3(self):
    # Skipped on Python 2: str() of the non-ASCII line is compared
    # directly against the expected text.
    expected = "[\n word1 15 22 23 30\n \xe9 1 2 3 4\n] 1 2 3 4"
    self.assertEqual(str(self.line_unicode), expected)
def test_line_not_equal_None(self):
self.assertNotEqual(self.line1, None)
......
......@@ -848,13 +848,14 @@ class TestCharBoxBuilder(BaseTest):
builders.Box("b", ((11, 12), (13, 14))),
builders.Box("c", ((12, 13), (14, 15))),
builders.Box("d", ((13, 14), (15, 16)), 87),
builders.Box(u"\xe9", ((14, 15), (16, 17)), 88),
]
builder.write_file(output, boxes)
output.seek(0)
output = output.read()
for box in boxes:
self.assertIn(box.content, output)
self.assertIn("{} {} {} {}".format(
self.assertIn(u"{} {} {} {}".format(
box.position[0][0], box.position[0][1],
box.position[1][0], box.position[1][1],
), output)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment