Commit 7b16fa8c authored by Thomas Perret

Add cuneiform tests

parent 8d1011dc
......@@ -23,7 +23,7 @@ import subprocess
import tempfile
from . import builders
from . import error
from .error import CuneiformError
from . import util
......@@ -63,30 +63,15 @@ def get_available_builders():
return [
builders.TextBuilder,
builders.WordBoxBuilder,
builders.LineBoxBuilder,
]
class CuneiformError(error.PyocrException):
    """Exception type of the cuneiform wrapper.

    Instances expose the two constructor arguments as ``status`` and
    ``message``; ``args`` holds them as the pair ``(status, message)``.
    """

    def __init__(self, status, message):
        # The base class is initialised with the message only; ``args`` is
        # then overridden so that it carries the status as well.
        super(CuneiformError, self).__init__(message)
        self.args = (status, message)
        self.status = status
        self.message = message
def temp_file(suffix):
    """Create a named temporary file.

    The file name starts with ``cuneiform_`` and ends with *suffix*.
    The object built by ``tempfile.NamedTemporaryFile`` is returned as is.
    """
    options = {"prefix": "cuneiform_", "suffix": suffix}
    return tempfile.NamedTemporaryFile(**options)
def cleanup(filename):
    """Best-effort removal of *filename*.

    Any ``OSError`` raised by ``os.remove`` (a missing file, for instance)
    is swallowed, so the call never fails for that reason.
    """
    try:
        os.remove(filename)
    except OSError:
        # Nothing to do: the removal is deliberately best-effort.
        return
def image_to_string(image, lang=None, builder=None):
if builder is None:
builder = builders.TextBuilder()
......@@ -152,8 +137,8 @@ def get_version():
proc.wait()
for line in output.split("\n"):
m = VERSION_LINE_RE.match(line)
g = m.groups()
if m is not None:
g = m.groups()
ver = (int(g[0]), int(g[1]), int(g[2]))
return ver
return None
......@@ -11,3 +11,11 @@ class TesseractError(PyocrException):
self.status = status
self.message = message
self.args = (status, message)
class CuneiformError(PyocrException):
    """Exception type for cuneiform, carrying a status and a message.

    Both constructor arguments are stored unchanged as ``status`` and
    ``message``; ``args`` is set to the pair ``(status, message)``.
    """

    def __init__(self, status, message):
        # Only the message goes to the base initialiser; ``args`` is
        # replaced afterwards so that it also contains the status.
        super(CuneiformError, self).__init__(message)
        self.args = (status, message)
        self.status = status
        self.message = message
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html><head><title></title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" >
<meta name='ocr-system' content='openocr'>
</head>
<body><div class='ocr_page' id='page_1' title='image "input/specific/test-european.jpg"; bbox 0 0 1024 800'>
<p><span class='ocr_line' id='line_1' title="bbox 105 67 823 114">The (qui ck) [brown] { fox ) jumps! <span class='ocr_cinfo' title="x_bboxes 105 67 130 96 136 67 155 97 161 76 178 98 -1 -1 -1 -1 205 69 216 105 222 76 241 107 247 78 266 99 273 68 281 98 -1 -1 -1 -1 288 77 305 100 310 69 329 99 335 71 347 106 -1 -1 -1 -1 376 70 386 108 391 70 415 101 415 70 439 101 439 70 463 101 463 70 487 101 487 70 511 101 518 72 528 110 -1 -1 -1 -1 559 73 572 110 -1 -1 -1 -1 580 72 598 103 597 81 617 104 622 83 642 103 -1 -1 -1 -1 650 74 663 111 -1 -1 -1 -1 687 74 700 113 706 84 727 106 732 83 764 105 768 84 789 114 794 84 809 107 817 75 823 107 -1 -1 -1 -1 "></span><br></span>
<span class='ocr_line' id='line_2' title="bbox 104 116 887 166">Over the $43,456.78 &lt;lazy&gt; ¹90 dog <span class='ocr_cinfo' title="x_bboxes 104 116 133 147 137 126 157 147 162 126 179 148 185 126 199 147 -1 -1 -1 -1 224 122 235 149 241 118 261 148 266 127 283 149 -1 -1 -1 -1 310 118 328 152 334 120 353 149 360 120 376 150 385 146 392 156 399 121 418 150 425 121 442 151 449 122 468 151 476 147 481 152 490 122 508 151 515 122 533 152 -1 -1 -1 -1 561 127 583 148 589 122 598 153 604 131 622 154 627 133 645 154 649 133 668 163 673 128 696 149 -1 -1 -1 -1 722 124 742 155 748 125 766 155 772 126 791 155 -1 -1 -1 -1 818 126 838 157 843 135 862 157 867 135 887 166 -1 -1 -1 -1 "></span><br></span>
<span class='ocr_line' id='line_3' title="bbox 103 166 835 207"><i>Ec </i>duck/goose, as 12.5'lo of E-mail <span class='ocr_cinfo' title="x_bboxes 103 166 125 197 121 176 134 197 -1 -1 -1 -1 160 167 180 198 186 177 205 198 210 176 227 199 232 168 252 198 257 168 269 199 273 177 293 207 298 177 318 200 322 178 342 200 347 178 361 200 366 178 383 201 389 196 396 206 -1 -1 -1 -1 424 179 442 202 448 179 463 202 -1 -1 -1 -1 493 172 504 202 514 172 532 202 540 198 545 203 554 173 572 202 578 172 593 189 584 173 607 203 599 187 614 204 -1 -1 -1 -1 638 182 658 205 662 173 680 204 -1 -1 -1 -1 700 175 725 205 731 191 744 197 749 183 781 206 785 184 803 207 809 175 819 206 825 175 835 206 -1 -1 -1 -1 "></span><br></span>
<span class='ocr_line' id='line_4' title="bbox 103 216 911 265">from aspammerQawebsite.corn is spam. <span class='ocr_cinfo' title="x_bboxes 103 216 120 246 121 225 135 246 138 225 158 248 164 226 194 247 -1 -1 -1 -1 220 226 238 249 244 227 259 249 263 227 284 257 288 227 306 250 312 228 344 249 349 228 381 250 385 229 403 251 408 229 423 250 427 220 464 261 434 228 457 251 469 231 498 252 502 230 520 253 524 222 544 253 549 231 564 253 569 222 579 253 585 227 596 254 600 231 618 254 625 249 630 254 637 232 656 255 656 232 675 255 675 232 694 255 694 232 713 255 -1 -1 -1 -1 742 224 752 255 758 234 773 256 -1 -1 -1 -1 799 234 814 257 818 234 839 265 843 235 862 258 867 235 899 257 905 253 911 258 -1 -1 -1 -1 "></span><br></span>
<span class='ocr_line' id='line_5' title="bbox 102 267 877 315">Der „schnelle" braune Fuchs springt <span class='ocr_cinfo' title="x_bboxes 102 267 130 296 136 275 153 298 159 276 173 297 -1 -1 -1 -1 198 293 214 303 221 276 235 299 240 277 257 299 263 268 283 299 289 277 309 299 314 278 331 300 337 269 346 299 353 269 362 300 368 278 385 301 390 270 406 280 -1 -1 -1 -1 433 270 453 302 459 280 473 301 477 280 495 302 500 281 520 303 526 280 546 302 551 281 568 303 -1 -1 -1 -1 594 273 617 303 621 283 641 304 646 282 663 305 669 274 689 304 694 283 709 305 -1 -1 -1 -1 735 283 750 306 754 284 775 314 780 284 795 306 799 275 809 306 815 284 836 306 840 285 861 315 866 281 877 308 -1 -1 -1 -1 "></span><br></span>
<span class='ocr_line' id='line_6' title="bbox 102 316 918 358">uber den faulen Hund. Le renard brun <span class='ocr_cinfo' title="x_bboxes 102 326 121 347 126 316 145 347 150 325 167 348 173 326 187 347 -1 -1 -1 -1 212 318 232 349 237 327 254 349 260 327 280 348 -1 -1 -1 -1 306 319 323 349 323 328 341 350 346 329 366 350 372 320 381 350 387 329 404 351 410 329 430 351 -1 -1 -1 -1 456 321 485 351 489 331 509 352 515 330 536 352 540 322 560 353 567 348 572 353 -1 -1 -1 -1 601 323 625 353 630 332 648 355 -1 -1 -1 -1 674 332 689 354 692 333 710 355 715 333 736 355 740 333 759 356 764 334 779 355 782 325 803 357 -1 -1 -1 -1 827 326 848 357 853 335 868 357 871 336 892 358 897 335 918 357 -1 -1 -1 -1 "></span><br></span>
<span class='ocr_line' id='line_7' title="bbox 101 367 833 410">«rapide» saute par-dessus le chien <span class='ocr_cinfo' title="x_bboxes 101 376 119 395 127 375 141 397 144 375 162 398 167 376 188 406 194 367 202 398 208 368 228 399 233 377 251 399 256 378 274 397 -1 -1 -1 -1 302 377 317 400 321 378 339 400 345 379 364 401 370 374 380 401 385 379 403 401 -1 -1 -1 -1 428 379 449 410 453 379 471 402 477 380 492 401 496 388 509 394 513 372 534 403 538 381 556 403 561 381 576 404 581 381 596 404 600 382 621 404 626 382 641 405 -1 -1 -1 -1 667 373 677 404 682 382 700 405 -1 -1 -1 -1 725 383 743 406 748 375 769 405 774 375 784 406 789 384 807 407 812 384 833 406 -1 -1 -1 -1 "></span><br></span>
<span class='ocr_line' id='line_8' title="bbox 100 420 859 465">paresseux. La volpe marrone rapida <span class='ocr_cinfo' title="x_bboxes 100 425 120 455 125 425 143 447 149 426 163 447 167 426 184 448 190 426 204 448 210 426 224 449 229 427 246 449 251 428 271 449 276 428 296 449 303 445 308 450 -1 -1 -1 -1 337 420 361 450 366 428 384 451 -1 -1 -1 -1 409 430 429 451 434 429 453 452 459 421 468 451 473 430 494 460 499 430 516 453 -1 -1 -1 -1 543 431 574 453 579 431 597 454 602 432 617 453 621 432 636 454 639 432 659 455 664 432 685 454 690 433 707 456 -1 -1 -1 -1 733 433 748 455 751 433 770 456 774 434 795 465 800 425 810 456 815 426 837 457 840 435 859 458 -1 -1 -1 -1 "></span><br></span>
<span class='ocr_line' id='line_9' title="bbox 100 467 834 512">salta sopra il cane pigro. El zorro <span class='ocr_cinfo' title="x_bboxes 100 475 114 497 119 475 137 497 143 467 152 497 159 471 169 498 174 476 192 498 -1 -1 -1 -1 219 476 233 499 238 477 257 499 262 477 282 508 288 478 302 499 306 478 324 500 -1 -1 -1 -1 351 469 360 500 367 469 376 500 -1 -1 -1 -1 403 479 420 501 425 479 443 502 449 479 469 501 474 480 491 502 -1 -1 -1 -1 517 480 538 511 543 472 553 502 558 481 579 512 584 481 599 503 602 481 622 504 628 499 633 504 -1 -1 -1 -1 662 474 687 504 693 474 703 505 -1 -1 -1 -1 729 484 748 505 752 483 772 506 776 484 792 506 795 484 811 506 814 484 834 507 -1 -1 -1 -1 "></span><br></span>
<span class='ocr_line' id='line_10' title="bbox 99 519 833 564">marron rapido salta sobre el perro <span class='ocr_cinfo' title="x_bboxes 99 525 130 547 135 525 153 548 159 526 174 547 178 526 193 547 196 526 216 549 221 527 242 548 -1 -1 -1 -1 268 527 283 549 286 527 304 550 309 528 330 558 335 519 345 550 350 520 371 551 375 528 395 551 -1 -1 -1 -1 421 529 436 552 440 529 459 552 464 521 474 551 479 525 490 552 495 530 513 553 -1 -1 -1 -1 540 531 555 553 559 531 579 554 582 522 603 554 608 532 623 553 626 532 644 555 -1 -1 -1 -1 669 532 687 555 692 524 702 555 -1 -1 -1 -1 728 533 749 564 753 534 771 556 776 534 791 556 795 534 810 556 813 534 833 557 -1 -1 -1 -1 "></span><br></span>
<span class='ocr_line' id='line_11' title="bbox 98 569 829 614">perezoso. A raposa marrom rapida <span class='ocr_cinfo' title="x_bboxes 98 575 118 605 123 575 140 598 146 576 160 597 164 576 181 598 186 577 204 598 209 576 228 599 233 577 248 599 253 577 272 599 279 594 284 599 -1 -1 -1 -1 313 569 342 599 -1 -1 -1 -1 369 579 383 600 387 579 405 601 409 579 430 610 435 579 454 602 459 580 474 602 478 580 497 603 -1 -1 -1 -1 523 580 555 602 560 581 578 604 583 581 598 603 602 582 617 603 621 582 640 605 645 582 677 604 -1 -1 -1 -1 703 583 718 605 721 583 740 606 744 583 765 614 770 575 780 606 785 575 806 607 810 584 829 607 -1 -1 -1 -1 "></span><br></span>
<span class='ocr_line' id='line_12' title="bbox 98 617 710 662">salta sobre o cao preguiqoso.<span class='ocr_cinfo' title="x_bboxes 98 625 112 647 117 625 135 647 141 617 150 647 157 621 167 648 172 626 190 648 -1 -1 -1 -1 217 626 231 649 236 627 255 649 259 618 279 649 285 627 299 649 303 628 320 650 -1 -1 -1 -1 346 628 366 651 -1 -1 -1 -1 391 629 408 651 413 629 431 652 436 629 456 652 -1 -1 -1 -1 481 630 502 661 507 630 522 652 525 630 543 653 547 631 568 662 572 632 593 654 598 622 608 653 613 632 631 661 635 632 655 655 660 632 675 655 679 632 699 655 705 650 710 655 "></span></span>
</p>
<p><span class='ocr_line' id='line_13' title="bbox 0 0 0 0"></span>
</p>
</div></body></html>
Phrase en français.
Avec des accents.
Éphémère
import subprocess
from io import StringIO
from unittest.mock import patch, MagicMock
from PIL import Image
from pyocr import cuneiform
from pyocr import builders
from .tests_base import BaseTest
class TestCuneiform(BaseTest):
    """
    Checks of the cuneiform module's informational helpers: availability,
    version, supported languages, name, orientation detection and builders.
    External commands are never run; ``subprocess.Popen`` is mocked.
    """

    @patch("pyocr.util.is_on_path")
    def test_available(self, on_path_mock):
        # XXX is it useful?
        on_path_mock.return_value = True
        self.assertTrue(cuneiform.is_available())
        on_path_mock.assert_called_once_with("cuneiform")

    @patch("subprocess.Popen")
    def test_version(self, popen_mock):
        # Banner handed back by the mocked process; the expected version
        # tuple is taken from its first line.
        banner = (
            "Cuneiform for Linux 1.1.0\n"
            "Usage: cuneiform [-l languagename -f format --dotmatrix --fax"
            " --singlecolumn -o result_file] imagefile"
        )
        process = MagicMock()
        process.stdout.read.return_value = banner.encode()
        popen_mock.return_value = process

        self.assertSequenceEqual(cuneiform.get_version(), (1, 1, 0))

    @patch("subprocess.Popen")
    def test_version_error(self, popen_mock):
        # Output without any version line: get_version() must return None.
        process = MagicMock()
        process.stdout.read.return_value = "\n".encode()
        popen_mock.return_value = process

        self.assertIsNone(cuneiform.get_version())

    @patch("subprocess.Popen")
    def test_langs(self, popen_mock):
        listing = (
            "Cuneiform for Linux 1.1.0\n"
            "Supported languages: eng ger fra rus swe spa ita ruseng ukr srp "
            "hrv pol dan por dut cze rum hun bul slv lav lit est tur."
        )
        process = MagicMock()
        process.stdout.read.return_value = listing.encode()
        popen_mock.return_value = process

        available = cuneiform.get_available_languages()

        self.assertIn("eng", available)
        self.assertIn("fra", available)
        popen_mock.assert_called_once_with(
            ["cuneiform", "-l"],
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )

    def test_name(self):
        self.assertEqual(cuneiform.get_name(), "Cuneiform (sh)")

    def test_can_detect_orientation(self):
        self.assertFalse(cuneiform.can_detect_orientation())

    def test_available_builders(self):
        expected = [
            builders.TextBuilder,
            builders.WordBoxBuilder,
            builders.LineBoxBuilder,
        ]
        self.assertListEqual(cuneiform.get_available_builders(), expected)
class TestCuneiformTxt(BaseTest):
    """
    These tests make sure the "usual" OCR works fine (the one generating
    a .txt file).

    ``subprocess.Popen``, ``codecs.open`` and ``pyocr.cuneiform.temp_file``
    are mocked, so no cuneiform binary is needed.
    """

    @patch("pyocr.tesseract.get_version")
    def setUp(self, get_version):
        # NOTE(review): this patches the tesseract module although only
        # cuneiform is exercised here -- presumably carried over from the
        # tesseract tests; confirm before removing.
        get_version.return_value = (4, 0, 0)
        self.builder = builders.TextBuilder()
        self.image = Image.open(self._get_file_path("text.jpg"))
        # Image.open() keeps the file open; close it when the test ends so
        # the handle is not leaked.
        self.addCleanup(self.image.close)

        # Stand-in for the object returned by subprocess.Popen(): its
        # stdout yields the banner as bytes and wait() reports status 0.
        self.stdout = MagicMock()
        self.stdout.stdout.read.return_value = (
            "Cuneiform for Linux 1.1.0\n".encode()
        )
        self.stdout.wait.return_value = 0

        # Stand-in for the value returned by temp_file(): entering it
        # yields an object whose ``name`` is the fake temporary path.
        # ``name`` is a MagicMock constructor argument, hence
        # configure_mock().
        self.tmp_filename = "/tmp/cuneiform_n0qfk87otxt"
        tmp_file = MagicMock()
        tmp_file.configure_mock(name=self.tmp_filename)
        self.enter = MagicMock()
        self.enter.__enter__.return_value = tmp_file

    @patch("pyocr.tesseract.get_version")
    @patch("pyocr.cuneiform.temp_file")
    @patch("codecs.open")
    @patch("subprocess.Popen")
    def test_image_to_string_defaults_to_text_buidler(self, Popen, copen,
                                                      temp_file, get_version):
        # No builder is passed: the output must match the text fixture.
        get_version.return_value = (4, 0, 0)
        Popen.return_value = self.stdout
        copen.return_value = StringIO(self._get_file_content("text"))
        temp_file.return_value = self.enter

        output = cuneiform.image_to_string(self.image)

        self.assertEqual(output, self._get_file_content("text").strip())
        Popen.assert_called_once_with(
            ["cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT
        )

    @patch("pyocr.cuneiform.temp_file")
    @patch("codecs.open")
    @patch("subprocess.Popen")
    def test_lang(self, Popen, copen, temp_file):
        # An explicit language must show up as "-l <lang>" in the command.
        Popen.return_value = self.stdout
        copen.return_value = StringIO(self._get_file_content("text"))
        temp_file.return_value = self.enter

        output = cuneiform.image_to_string(self.image, lang="fra",
                                           builder=self.builder)

        self.assertEqual(output, self._get_file_content("text").strip())
        Popen.assert_called_once_with(
            ["cuneiform", "-l", "fra", "-f", "text", "-o", self.tmp_filename,
             "-"],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT
        )

    @patch("pyocr.cuneiform.temp_file")
    @patch("codecs.open")
    @patch("subprocess.Popen")
    def test_text(self, Popen, copen, temp_file):
        Popen.return_value = self.stdout
        copen.return_value = StringIO(self._get_file_content("text"))
        temp_file.return_value = self.enter

        output = cuneiform.image_to_string(self.image,
                                           builder=self.builder)

        self.assertEqual(output, self._get_file_content("text").strip())
        Popen.assert_called_once_with(
            ["cuneiform", "-f", "text", "-o", self.tmp_filename, "-"],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT
        )

    @patch("subprocess.Popen")
    def test_text_error(self, Popen):
        # A non-zero wait() status must surface as CuneiformError carrying
        # that status and the process output as message.
        message = ("Cuneiform for Linux 1.1.0\n"
                   "Magick: Improper image header (example.png) reported by "
                   "coders/png.c:2932 (ReadPNGImage)\n")
        self.stdout.stdout.read.return_value = message.encode()
        self.stdout.wait.return_value = 1
        Popen.return_value = self.stdout

        with self.assertRaises(cuneiform.CuneiformError) as ce:
            cuneiform.image_to_string(self.image, builder=self.builder)

        # Checked outside the ``with`` block so the assertions really run.
        self.assertEqual(ce.exception.status, 1)
        self.assertEqual(ce.exception.message, message)
class TestCuneiformDigits(BaseTest):
    """
    These tests make sure that asking cuneiform for digits recognition
    raises NotImplementedError.
    """

    @patch("pyocr.tesseract.get_version")
    def setUp(self, get_version):
        # NOTE(review): patches the tesseract module although only
        # cuneiform is exercised here -- presumably carried over from the
        # tesseract tests; confirm before removing.
        get_version.return_value = (4, 0, 0)
        self.builder = builders.DigitBuilder()

    def test_digits_not_implemented(self):
        image = Image.open(self._get_file_path("digits.png"))
        # Close the image file when the test ends, whatever the outcome.
        self.addCleanup(image.close)
        with self.assertRaises(NotImplementedError):
            cuneiform.image_to_string(image, builder=self.builder)

    def test_digits_box_not_implemented(self):
        # NOTE(review): this uses the same DigitBuilder as the test above,
        # so both tests currently check the same thing; the name suggests a
        # box-producing digit builder was intended -- confirm.
        image = Image.open(self._get_file_path("digits.png"))
        self.addCleanup(image.close)
        with self.assertRaises(NotImplementedError):
            cuneiform.image_to_string(image,
                                      builder=self.builder)
class TestCuneiformWordBox(BaseTest):
    """
    These tests make sure that cuneiform box handling works fine with the
    word box builder.
    """

    @patch("pyocr.tesseract.get_version")
    def setUp(self, get_version):
        # NOTE(review): patches the tesseract module although only
        # cuneiform is exercised here -- presumably carried over from the
        # tesseract tests; confirm before removing.
        get_version.return_value = (4, 0, 0)
        self.builder = builders.WordBoxBuilder()
        self.image = Image.open(self._get_file_path("paragraph.jpg"))
        # Image.open() keeps the file open; close it when the test ends so
        # the handle is not leaked.
        self.addCleanup(self.image.close)

        # Stand-in for the object returned by subprocess.Popen(): its
        # stdout yields the banner as bytes and wait() reports status 0.
        self.stdout = MagicMock()
        self.stdout.stdout.read.return_value = (
            "Cuneiform for Linux 1.1.0\n".encode()
        )
        self.stdout.wait.return_value = 0

        # Stand-in for the value returned by temp_file(): entering it
        # yields an object whose ``name`` is the fake temporary path.
        # ``name`` is a MagicMock constructor argument, hence
        # configure_mock().
        self.tmp_filename = "/tmp/cuneiform_n0qfk87otxt"
        tmp_file = MagicMock()
        tmp_file.configure_mock(name=self.tmp_filename)
        self.enter = MagicMock()
        self.enter.__enter__.return_value = tmp_file

    @patch("pyocr.cuneiform.temp_file")
    @patch("codecs.open")
    @patch("subprocess.Popen")
    def test_word(self, Popen, copen, temp_file):
        Popen.return_value = self.stdout
        copen.return_value = StringIO(
            self._get_file_content("cuneiform.words")
        )
        temp_file.return_value = self.enter

        output = cuneiform.image_to_string(self.image,
                                           builder=self.builder)

        # Box builders make cuneiform emit hOCR instead of plain text.
        Popen.assert_called_once_with(
            ["cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT
        )
        for box in output:
            self.assertIsInstance(box, builders.Box)

    @patch("subprocess.Popen")
    def test_word_error(self, Popen):
        # A non-zero wait() status must surface as CuneiformError carrying
        # that status and the process output as message.
        message = ("Cuneiform for Linux 1.1.0\n"
                   "Magick: Improper image header (example.png) reported by "
                   "coders/png.c:2932 (ReadPNGImage)\n")
        self.stdout.stdout.read.return_value = message.encode()
        self.stdout.wait.return_value = 1
        Popen.return_value = self.stdout

        with self.assertRaises(cuneiform.CuneiformError) as ce:
            cuneiform.image_to_string(self.image,
                                      builder=self.builder)

        # Checked outside the ``with`` block so the assertions really run.
        self.assertEqual(ce.exception.status, 1)
        self.assertEqual(ce.exception.message, message)
class TestCuneiformLineBox(BaseTest):
    """
    These tests make sure that cuneiform box handling works fine with the
    line box builder.
    """

    @patch("pyocr.tesseract.get_version")
    def setUp(self, get_version):
        # NOTE(review): patches the tesseract module although only
        # cuneiform is exercised here -- presumably carried over from the
        # tesseract tests; confirm before removing.
        get_version.return_value = (4, 0, 0)
        self.builder = builders.LineBoxBuilder()
        self.image = Image.open(self._get_file_path("paragraph.jpg"))
        # Image.open() keeps the file open; close it when the test ends so
        # the handle is not leaked.
        self.addCleanup(self.image.close)

        # Stand-in for the object returned by subprocess.Popen(): its
        # stdout yields the banner as bytes and wait() reports status 0.
        self.stdout = MagicMock()
        self.stdout.stdout.read.return_value = (
            "Cuneiform for Linux 1.1.0\n".encode()
        )
        self.stdout.wait.return_value = 0

        # Stand-in for the value returned by temp_file(): entering it
        # yields an object whose ``name`` is the fake temporary path.
        # ``name`` is a MagicMock constructor argument, hence
        # configure_mock().
        self.tmp_filename = "/tmp/cuneiform_n0qfk87otxt"
        tmp_file = MagicMock()
        tmp_file.configure_mock(name=self.tmp_filename)
        self.enter = MagicMock()
        self.enter.__enter__.return_value = tmp_file

    @patch("pyocr.cuneiform.temp_file")
    @patch("codecs.open")
    @patch("subprocess.Popen")
    def test_line(self, Popen, copen, temp_file):
        Popen.return_value = self.stdout
        copen.return_value = StringIO(
            self._get_file_content("cuneiform.lines")
        )
        temp_file.return_value = self.enter

        output = cuneiform.image_to_string(self.image,
                                           builder=self.builder)

        # Box builders make cuneiform emit hOCR instead of plain text.
        Popen.assert_called_once_with(
            ["cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-"],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT
        )
        for box in output:
            self.assertIsInstance(box, builders.LineBox)

    @patch("subprocess.Popen")
    def test_line_error(self, Popen):
        # A non-zero wait() status must surface as CuneiformError carrying
        # that status and the process output as message.
        message = ("Cuneiform for Linux 1.1.0\n"
                   "Magick: Improper image header (example.png) reported by "
                   "coders/png.c:2932 (ReadPNGImage)\n")
        self.stdout.stdout.read.return_value = message.encode()
        self.stdout.wait.return_value = 1
        Popen.return_value = self.stdout

        with self.assertRaises(cuneiform.CuneiformError) as ce:
            cuneiform.image_to_string(self.image,
                                      builder=self.builder)

        # Checked outside the ``with`` block so the assertions really run.
        self.assertEqual(ce.exception.status, 1)
        self.assertEqual(ce.exception.message, message)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment