Commit 0d8cc236 authored by Jerome Flesch

Tests: Tesseract: Add tests using japanese characters

Signed-off-by: Jerome Flesch <jflesch@gmail.com>
parent 970e7cfe
......@@ -68,7 +68,12 @@ bmp, tiff, and others. It also supports bounding box data.
## Tests
Tests are made to be run with the latest versions of Tesseract and Cuneiform.
the first tests verifie that you're using the expected version.
The first tests verify that you're using the expected version.
To run the tests, you will need the following language support:
- English (tesseract-ocr-eng)
- French (tesseract-ocr-fra)
- Japanese (tesseract-ocr-jpn)
## Copyright
......
ナ 34 668 83 755 0
こ 68 670 119 725 0
し 137 673 179 746 0
丶 187 683 220 741 0
な 231 667 320 755 0
ヵ 331 668 391 754 0
丶 392 695 423 740 0
り 446 667 507 755 0
つ 533 673 617 741 0
お 666 669 754 755 0
れ 763 667 856 754 0
の 866 669 952 748 0
よ 970 667 1047 755 0
め 1066 669 1151 754 0
田 34 515 118 601 0
井 130 514 222 606 0
中 234 514 318 607 0
律 328 514 423 607 0
俺 461 514 557 607 0
の 566 519 652 598 0
嫁 662 514 757 607 0
a 27 415 61 453 0
b 68 414 107 467 0
C 111 414 147 454 0
A 172 415 222 467 0
B 227 415 267 467 0
C 274 414 319 468 0
ぁ 348 413 402 469 0
臓 414 416 441 463 0
丶 446 423 467 461 0
う 480 413 526 469 0
え 539 413 595 469 0
お 603 414 660 469 0
ヵ 666 414 705 469 0
丶 705 431 725 460 0
き 735 413 783 469 0
く 803 412 840 469 0
野 863 415 880 467 0
ナ 879 412 914 468 0
こ 928 416 973 465 0
0 26 318 67 372 0
〇 72 318 120 372 0
ー 133 319 152 371 0
ー 173 319 182 371 0
2 190 319 229 372 0
ー 238 319 247 371 0
3 254 318 293 372 0
B 302 319 342 371 0
4 346 319 389 371 0
5 395 318 433 371 0
S 439 318 479 372 0
6 485 318 524 372 0
7 532 319 570 371 0
8 576 318 616 372 0
9 622 319 661 372 0
F 263 166 348 262 0
U 451 166 533 265 0
N 639 165 723 265 0
ぁ 316 86 351 123 0
っ 361 86 392 112 0
ナ 403 87 427 123 0
こ 418 87 440 111 0
〝 451 98 464 123 0
丿 459 87 479 123 0
ま 491 87 523 123 0
ぇ 534 86 570 123 0
じ 581 87 611 123 0
ゃ 621 84 654 115 0
ん 663 87 701 123 0
! 713 87 723 122 0
r 616 29 634 51 0
i 638 29 644 62 0
t 648 29 669 59 0
s 672 29 696 51 0
u 700 29 723 51 0
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name='ocr-system' content='tesseract 3.02.01' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "../data/test-japanese.jpg"; bbox 0 0 1185 778; ppageno 0'>
<div class='ocr_carea' id='block_1_1' title="bbox 34 23 1151 264">
<p class='ocr_par' dir='ltr' id='par_1' title="bbox 34 23 1151 264">
<span class='ocr_line' id='line_1' title="bbox 34 23 1151 111"><span class='ocrx_word' id='word_1' title="bbox 34 23 617 111">ナこし丶なヵ丶りつ</span> <span class='ocrx_word' id='word_2' title="bbox 666 23 1151 111">おれのよめ</span>
</span>
<span class='ocr_line' id='line_2' title="bbox 34 171 757 264"><span class='ocrx_word' id='word_3' title="bbox 34 171 423 264">田井中律</span> <span class='ocrx_word' id='word_4' title="bbox 461 171 757 264">俺の嫁</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_2_2' title="bbox 26 309 973 460">
<p class='ocr_par' dir='ltr' id='par_2' title="bbox 26 309 973 460">
<span class='ocr_line' id='line_3' title="bbox 27 309 973 366"><span class='ocrx_word' id='word_5' title="bbox 27 311 147 364">abC</span> <span class='ocrx_word' id='word_6' title="bbox 172 310 319 364">ABC</span> <span class='ocrx_word' id='word_7' title="bbox 348 309 783 365">ぁ臓丶うえおヵ丶き</span> <span class='ocrx_word' id='word_8' title="bbox 803 309 840 366"></span> <span class='ocrx_word' id='word_9' title="bbox 863 310 973 366">野ナこ</span>
</span>
<span class='ocr_line' id='line_4' title="bbox 26 406 661 460"><span class='ocrx_word' id='word_10' title="bbox 26 406 152 460">0〇ー</span> <span class='ocrx_word' id='word_11' title="bbox 173 406 661 460">ー2ー3B45S6789</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_3_3' title="bbox 263 513 723 613">
<p class='ocr_par' dir='ltr' id='par_3' title="bbox 263 513 723 613">
<span class='ocr_line' id='line_5' title="bbox 263 513 723 613"><span class='ocrx_word' id='word_12' title="bbox 263 516 348 612">F</span> <span class='ocrx_word' id='word_13' title="bbox 451 513 533 612">U</span> <span class='ocrx_word' id='word_14' title="bbox 639 513 723 613">N</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_4_4' title="bbox 316 655 723 694">
<p class='ocr_par' dir='ltr' id='par_4' title="bbox 316 655 723 694">
<span class='ocr_line' id='line_6' title="bbox 316 655 723 694"><span class='ocrx_word' id='word_15' title="bbox 316 655 723 694">ぁっナこ〝丿まぇじゃん!</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_5_5' title="bbox 616 716 723 749">
<p class='ocr_par' dir='ltr' id='par_5' title="bbox 616 716 723 749">
<span class='ocr_line' id='line_7' title="bbox 616 716 723 749"><span class='ocrx_word' id='word_16' title="bbox 616 716 723 749">ritsu</span>
</span>
</p>
</div>
</div>
</body>
</html>
ナこし丶なヵ丶りつ おれのよめ
田井中律 俺の嫁
abC ABC ぁ臓丶うえおヵ丶き く 野ナこ
0〇ー ー2ー3B45S6789
F U N
ぁっナこ〝丿まぇじゃん!
ritsu
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name='ocr-system' content='tesseract 3.02.01' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "../data/test-japanese.jpg"; bbox 0 0 1185 778; ppageno 0'>
<div class='ocr_carea' id='block_1_1' title="bbox 34 23 1151 264">
<p class='ocr_par' dir='ltr' id='par_1' title="bbox 34 23 1151 264">
<span class='ocr_line' id='line_1' title="bbox 34 23 1151 111"><span class='ocrx_word' id='word_1' title="bbox 34 23 617 111">ナこし丶なヵ丶りつ</span> <span class='ocrx_word' id='word_2' title="bbox 666 23 1151 111">おれのよめ</span>
</span>
<span class='ocr_line' id='line_2' title="bbox 34 171 757 264"><span class='ocrx_word' id='word_3' title="bbox 34 171 423 264">田井中律</span> <span class='ocrx_word' id='word_4' title="bbox 461 171 757 264">俺の嫁</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_2_2' title="bbox 26 309 973 460">
<p class='ocr_par' dir='ltr' id='par_2' title="bbox 26 309 973 460">
<span class='ocr_line' id='line_3' title="bbox 27 309 973 366"><span class='ocrx_word' id='word_5' title="bbox 27 311 147 364">abC</span> <span class='ocrx_word' id='word_6' title="bbox 172 310 319 364">ABC</span> <span class='ocrx_word' id='word_7' title="bbox 348 309 783 365">ぁ臓丶うえおヵ丶き</span> <span class='ocrx_word' id='word_8' title="bbox 803 309 840 366"></span> <span class='ocrx_word' id='word_9' title="bbox 863 310 973 366">野ナこ</span>
</span>
<span class='ocr_line' id='line_4' title="bbox 26 406 661 460"><span class='ocrx_word' id='word_10' title="bbox 26 406 152 460">0〇ー</span> <span class='ocrx_word' id='word_11' title="bbox 173 406 661 460">ー2ー3B45S6789</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_3_3' title="bbox 263 513 723 613">
<p class='ocr_par' dir='ltr' id='par_3' title="bbox 263 513 723 613">
<span class='ocr_line' id='line_5' title="bbox 263 513 723 613"><span class='ocrx_word' id='word_12' title="bbox 263 516 348 612">F</span> <span class='ocrx_word' id='word_13' title="bbox 451 513 533 612">U</span> <span class='ocrx_word' id='word_14' title="bbox 639 513 723 613">N</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_4_4' title="bbox 316 655 723 694">
<p class='ocr_par' dir='ltr' id='par_4' title="bbox 316 655 723 694">
<span class='ocr_line' id='line_6' title="bbox 316 655 723 694"><span class='ocrx_word' id='word_15' title="bbox 316 655 723 694">ぁっナこ〝丿まぇじゃん!</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_5_5' title="bbox 616 716 723 749">
<p class='ocr_par' dir='ltr' id='par_5' title="bbox 616 716 723 749">
<span class='ocr_line' id='line_7' title="bbox 616 716 723 749"><span class='ocrx_word' id='word_16' title="bbox 616 716 723 749">ritsu</span>
</span>
</p>
</div>
</div>
</body>
</html>
......@@ -29,12 +29,16 @@ class TestContext(unittest.TestCase):
def test_langs(self):
    """Check that the language data the test suite relies on is installed.

    Asks tesseract.get_available_languages() for the installed languages
    and asserts that the English ("eng"), French ("fra") and Japanese
    ("jpn") codes are all present.

    assertIn is used instead of assertTrue(x in y) so that a failure can
    also report the list of languages that was actually returned.
    """
    langs = tesseract.get_available_languages()
    self.assertIn("eng", langs,
                  ("English training does not appear to be installed."
                   " (required for the tests)"))
    self.assertIn("fra", langs,
                  ("French training does not appear to be installed."
                   " (required for the tests)"))
    self.assertIn("jpn", langs,
                  ("Japanese training does not appear to be installed."
                   " (required for the tests)"))
# No cleanup is performed after a test: the body is a bare `pass`.
def tearDown(self):
pass
......@@ -73,6 +77,9 @@ class TestTxt(unittest.TestCase):
# Runs the class's private __test_txt helper on the French sample.
# Arguments are presumably (input image, expected-output file, Tesseract
# language code) -- confirm against __test_txt, which is not shown here.
def test_french(self):
self.__test_txt('test-french.jpg', 'test-french.txt', 'fra')
# Runs the class's private __test_txt helper on the Japanese sample.
# Arguments are presumably (input image, expected-output file, Tesseract
# language code) -- confirm against __test_txt, which is not shown here.
def test_japanese(self):
self.__test_txt('test-japanese.jpg', 'test-japanese.txt', 'jpn')
# No cleanup is performed after a test: the body is a bare `pass`.
def tearDown(self):
pass
......@@ -111,6 +118,9 @@ class TestCharBox(unittest.TestCase):
# Runs the class's private __test_txt helper on the French sample, using
# the '.box' expected-output file. Arguments are presumably (input image,
# expected-output file, language code) -- confirm against __test_txt.
def test_french(self):
self.__test_txt('test-french.jpg', 'test-french.box', 'fra')
# Runs the class's private __test_txt helper on the Japanese sample, using
# the '.box' expected-output file. Arguments are presumably (input image,
# expected-output file, language code) -- confirm against __test_txt.
def test_japanese(self):
self.__test_txt('test-japanese.jpg', 'test-japanese.box', 'jpn')
def test_write_read(self):
original_boxes = tesseract.image_to_string(
Image.open("tests/data/test.png"), builder=self.builder)
......@@ -180,6 +190,9 @@ class TestWordBox(unittest.TestCase):
# Runs the class's private __test_txt helper on the French sample, using
# the '.words' expected-output file. Arguments are presumably (input image,
# expected-output file, language code) -- confirm against __test_txt.
def test_french(self):
self.__test_txt('test-french.jpg', 'test-french.words', 'fra')
# Runs the class's private __test_txt helper on the Japanese sample, using
# the '.words' expected-output file. Arguments are presumably (input image,
# expected-output file, language code) -- confirm against __test_txt.
def test_japanese(self):
self.__test_txt('test-japanese.jpg', 'test-japanese.words', 'jpn')
def test_write_read(self):
original_boxes = tesseract.image_to_string(
Image.open("tests/data/test.png"), builder=self.builder)
......@@ -243,6 +256,9 @@ class TestLineBox(unittest.TestCase):
# Runs the class's private __test_txt helper on the French sample, using
# the '.lines' expected-output file. Arguments are presumably (input image,
# expected-output file, language code) -- confirm against __test_txt.
def test_french(self):
self.__test_txt('test-french.jpg', 'test-french.lines', 'fra')
# Runs the class's private __test_txt helper on the Japanese sample, using
# the '.lines' expected-output file. Arguments are presumably (input image,
# expected-output file, language code) -- confirm against __test_txt.
def test_japanese(self):
self.__test_txt('test-japanese.jpg', 'test-japanese.lines', 'jpn')
def test_write_read(self):
original_boxes = tesseract.image_to_string(
Image.open("tests/data/test.png"), builder=self.builder)
......@@ -291,6 +307,7 @@ def get_all_tests():
'test_basic',
'test_european',
'test_french',
'test_japanese',
'test_write_read',
]
tests = unittest.TestSuite(map(TestCharBox, test_names))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment