Commit 29662f96 authored by Thomas Perret

Add tests for boxes

parent cf90132e
......@@ -14,6 +14,8 @@ except ImportError:
import xml.dom.minidom
import logging
import six
from .util import to_unicode
logger = logging.getLogger(__name__)
......@@ -38,6 +40,7 @@ _XHTML_HEADER = to_unicode("""<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
""")
@six.python_2_unicode_compatible
class Box(object):
"""
Boxes are rectangles around each individual element recognized in the
......@@ -53,25 +56,10 @@ class Box(object):
tuple of tuple:
((box_pt_min_x, box_pt_min_y), (box_pt_max_x, box_pt_max_y))
"""
content = to_unicode(content)
self.content = content
self.position = position
self.confidence = confidence
def get_unicode_string(self):
"""
Return the string corresponding to the box, in unicode (utf8).
This string can be stored in a file as-is (see write_box_file())
and reread using read_box_file().
"""
return to_unicode("%s %d %d %d %d") % (
self.content,
self.position[0][0],
self.position[0][1],
self.position[1][0],
self.position[1][1],
)
def get_xml_tag(self, parent_doc):
span_tag = parent_doc.createElement("span")
span_tag.setAttribute("class", "ocrx_word")
......@@ -85,13 +73,19 @@ class Box(object):
return span_tag
def __str__(self):
return self.get_unicode_string().encode('utf-8')
return u"{} {} {} {} {}".format(
self.content,
self.position[0][0],
self.position[0][1],
self.position[1][0],
self.position[1][1],
)
def __box_cmp(self, other):
"""
Comparison function.
"""
if other is None:
if other is None or getattr(other, "position", None) is None:
return -1
for (x, y) in ((self.position[0][1], other.position[0][1]),
(self.position[1][1], other.position[1][1]),
......@@ -130,6 +124,7 @@ class Box(object):
return (position_hash ^ hash(self.content) ^ hash(self.content))
@six.python_2_unicode_compatible
class LineBox(object):
"""
Boxes are rectangles around each individual element recognized in the
......@@ -147,54 +142,52 @@ class LineBox(object):
self.word_boxes = word_boxes
self.position = position
def get_unicode_string(self):
"""
Return the string corresponding to the box, in unicode (utf8).
This string can be stored in a file as-is (see write_box_file())
and reread using read_box_file().
"""
txt = to_unicode("[\n")
for box in self.word_boxes:
txt += to_unicode(" %s\n") % box.get_unicode_string()
return to_unicode("%s] %d %d %d %d") % (
txt,
self.position[0][0],
self.position[0][1],
self.position[1][0],
self.position[1][1],
)
def __get_content(self):
txt = to_unicode("")
@property
def content(self):
txt = u""
for box in self.word_boxes:
txt += box.content + to_unicode(" ")
txt += box.content + u" "
txt = txt.strip()
return txt
content = property(__get_content)
def get_xml_tag(self, parent_doc):
span_tag = parent_doc.createElement("span")
span_tag.setAttribute("class", "ocr_line")
span_tag.setAttribute("title", ("bbox %d %d %d %d" % (
(self.position[0][0], self.position[0][1],
self.position[1][0], self.position[1][1]))))
for box in self.word_boxes:
space = xml.dom.minidom.Text()
space.data = " "
span_tag.appendChild(space)
for box_idx, box in enumerate(self.word_boxes):
if box_idx:
space = xml.dom.minidom.Text()
space.data = " "
span_tag.appendChild(space)
box_xml = box.get_xml_tag(parent_doc)
span_tag.appendChild(box_xml)
return span_tag
def __str__(self):
return self.get_unicode_string().encode('utf-8')
txt = u"[\n"
for box in self.word_boxes:
txt += u" {} {} {} {} {}\n".format(
box.content,
box.position[0][0],
box.position[0][1],
box.position[1][0],
box.position[1][1],
)
return u"{}] {} {} {} {}".format(
txt,
self.position[0][0],
self.position[0][1],
self.position[1][0],
self.position[1][1],
)
def __box_cmp(self, other):
"""
Comparison function.
"""
if other is None:
if other is None or getattr(other, "position", None) is None:
return -1
for (x, y) in ((self.position[0][1], other.position[0][1]),
(self.position[1][1], other.position[1][1]),
......@@ -231,7 +224,7 @@ class LineBox(object):
position_hash += ((self.position[0][1] & 0xFF) << 8)
position_hash += ((self.position[1][0] & 0xFF) << 16)
position_hash += ((self.position[1][1] & 0xFF) << 24)
return (position_hash ^ hash(content) ^ hash(content))
return (position_hash ^ hash(content))
class BaseBuilder(object):
......
import unittest
import xml.dom.minidom
from pyocr import builders
class TestBox(unittest.TestCase):
    """
    Exercise builders.Box: construction, XML tag generation, string
    rendering, comparison operators and hashing.
    """

    def setUp(self):
        # box1 and box1_bis share the same position but hold different
        # contents; box2 is located elsewhere and is given an explicit
        # confidence.
        shared_position = ((15, 22), (23, 42))
        self.box1 = builders.Box("word1", shared_position)
        self.box1_bis = builders.Box("word1_bis", shared_position)
        self.box2 = builders.Box("word2", ((30, 5), (40, 15)), 95)

    def test_init(self):
        first = self.box1
        self.assertEqual(first.content, "word1")
        self.assertSequenceEqual(first.position, ((15, 22), (23, 42)))
        # No confidence was passed for box1: 0 is expected.
        self.assertEqual(first.confidence, 0)
        self.assertEqual(self.box2.confidence, 95)

    def test_get_xml_tag(self):
        dom_impl = xml.dom.minidom.getDOMImplementation()
        document = dom_impl.createDocument(None, "root", None)
        word_tag = self.box1.get_xml_tag(document)
        # The tag must wrap exactly one child node: the word itself.
        self.assertEqual(len(word_tag.childNodes), 1)
        self.assertEqual(word_tag.getAttribute("class"), "ocrx_word")
        self.assertEqual(
            word_tag.getAttribute("title"),
            "bbox 15 22 23 42; x_wconf 0"
        )
        self.assertEqual(word_tag.firstChild.data, "word1")

    def test_str_method(self):
        rendered = str(self.box1)
        self.assertEqual(rendered, "word1 15 22 23 42")

    def test_box_not_equal_None(self):
        self.assertNotEqual(self.box1, None)

    def test_box_equal(self):
        # Same position, different content: still expected to be equal.
        self.assertEqual(self.box1, self.box1_bis)

    def test_box_not_equal(self):
        self.assertNotEqual(self.box1, self.box2)

    def test_box_lower(self):
        smaller, bigger = self.box2, self.box1
        self.assertLess(smaller, bigger)
        self.assertLessEqual(smaller, bigger)

    def test_box_greater(self):
        bigger, smaller = self.box1, self.box2
        self.assertGreater(bigger, smaller)
        self.assertGreaterEqual(bigger, smaller)

    def test_box_equal_not_box(self):
        # Objects that are not boxes must never compare equal to a box.
        for not_a_box in (0, []):
            self.assertNotEqual(self.box1, not_a_box)

    def test_hash(self):
        reference_hash = hash(self.box1)
        self.assertEqual(reference_hash, hash(self.box1_bis))
        self.assertNotEqual(reference_hash, hash(self.box2))
class TestLineBox(unittest.TestCase):
    """
    Exercise builders.LineBox: construction, XML tag generation, string
    rendering, comparison operators and hashing.
    """

    def setUp(self):
        words = [
            builders.Box("word1", ((15, 22), (23, 30))),
            builders.Box("word2", ((25, 23), (30, 32))),
            builders.Box("word3", ((32, 25), (40, 32)), 95),
            builders.Box("word4", ((41, 18), (44, 33)), 98),
        ]
        full_position = ((14, 15), (45, 33))
        # line1: the four words.
        self.line1 = builders.LineBox(list(words), full_position)
        # line1_bis: same position as line1, but only the first two words.
        self.line1_bis = builders.LineBox(words[:2], full_position)
        # line2: the last two words, at another position.
        self.line2 = builders.LineBox(words[2:], ((30, 5), (53, 20)))
        # line1_dupl: same words and same position as line1.
        self.line1_dupl = builders.LineBox(list(words), full_position)

    def test_init(self):
        line = self.line1
        self.assertEqual(len(line.word_boxes), 4)
        self.assertSequenceEqual(line.position, ((14, 15), (45, 33)))
        self.assertEqual(line.content, "word1 word2 word3 word4")

    def test_get_xml_tag(self):
        dom_impl = xml.dom.minidom.getDOMImplementation()
        document = dom_impl.createDocument(None, "root", None)
        line_tag = self.line1.get_xml_tag(document)
        # One child per word plus one separator between two consecutive
        # words: nothing before the first word, nothing after the last.
        word_count = len(self.line1.word_boxes)
        self.assertEqual(len(line_tag.childNodes), 2 * word_count - 1)
        self.assertEqual(line_tag.getAttribute("class"), "ocr_line")
        self.assertEqual(line_tag.getAttribute("title"), "bbox 14 15 45 33")
        self.assertEqual(line_tag.firstChild.firstChild.data, "word1")
        self.assertEqual(line_tag.lastChild.firstChild.data, "word4")

    def test_line_str(self):
        rendered = str(self.line1)
        # Expected layout: "[", one " <word box>" row per word, then
        # "] " followed by the position of the line.
        word_rows = "".join(
            " " + str(word) + "\n" for word in self.line1.word_boxes
        )
        self.assertEqual(rendered, "[\n" + word_rows + "] 14 15 45 33")

    def test_line_not_equal_None(self):
        self.assertNotEqual(self.line1, None)

    def test_box_equal(self):
        # Same position, different words: still expected to be equal.
        self.assertEqual(self.line1, self.line1_bis)

    def test_box_not_equal(self):
        self.assertNotEqual(self.line1, self.line2)

    def test_line_lower(self):
        smaller, bigger = self.line2, self.line1
        self.assertLess(smaller, bigger)
        self.assertLessEqual(smaller, bigger)

    def test_line_greater(self):
        bigger, smaller = self.line1, self.line2
        self.assertGreater(bigger, smaller)
        self.assertGreaterEqual(bigger, smaller)

    def test_hash(self):
        reference_hash = hash(self.line1)
        self.assertIsNotNone(reference_hash)
        # Unlike equality, the hash is expected to depend on the words.
        self.assertNotEqual(reference_hash, hash(self.line1_bis))
        self.assertNotEqual(reference_hash, hash(self.line2))
        self.assertEqual(reference_hash, hash(self.line1_dupl))
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment