Commit 00b01992 authored by Matthias Kraus

First shot at pyocr.libtesseract.image_to_pdf for multiple pages

parent 625b2a2a
......@@ -176,19 +176,19 @@ def image_to_string(image, lang=None, builder=None):
return builder.get_output()
def image_to_pdf(image, output_file, lang=None, input_file="stdin",
def images_to_pdf(images, output_file, lang=None, input_files=["stdin"],
textonly=False):
'''
Creates pdf file with embedded text based on OCR from an image
Args:
image: image to be converted
images: images to be converted to one pdf
output_file: path to the file that will be created, `.pdf` extension
should not be specified
lang: three letter language code. For available languages see
https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages.
Defaults to None.
input_file: path to the image file that should be beneath the text in
input_files: paths to the image files that should be beneath the text pages in
output pdf. If not specified (stdin, incorrect file) output pdf is
correct but tesseract writes some errors about not being able to
open the file. Defaults to stdin.
......@@ -198,21 +198,27 @@ def image_to_pdf(image, output_file, lang=None, input_file="stdin",
handle = tesseract_raw.init(lang=lang)
renderer = None
try:
tesseract_raw.set_image(handle, image)
tesseract_raw.set_page_seg_mode(
handle, tesseract_raw.PageSegMode.AUTO_OSD
)
tesseract_raw.set_input_name(handle, input_file)
tesseract_raw.recognize(handle)
renderer = tesseract_raw.init_pdf_renderer(
handle, output_file, textonly
)
assert(renderer)
tesseract_raw.begin_document(renderer, "")
tesseract_raw.add_renderer_image(handle, renderer)
if len(input_files) == 1 and len(input_files) < len(images):
input_files = len(images) * input_files
for image, input_file in zip(images, input_files):
tesseract_raw.set_image(handle, image)
tesseract_raw.set_input_name(handle, input_file)
tesseract_raw.recognize(handle)
tesseract_raw.add_renderer_image(handle, renderer)
tesseract_raw.end_document(renderer)
finally:
tesseract_raw.cleanup(handle)
......@@ -220,6 +226,28 @@ def image_to_pdf(image, output_file, lang=None, input_file="stdin",
tesseract_raw.cleanup(renderer)
def image_to_pdf(image, output_file, lang=None, input_file="stdin",
                 textonly=False):
    '''
    Create a pdf file with embedded text based on OCR from a single image.

    Single-page convenience wrapper: the image and its input file name are
    each wrapped in a one-element list and handed to images_to_pdf().

    Args:
        image: image to be converted
        output_file: path to the file that will be created, `.pdf` extension
            should not be specified
        lang: three letter language code. For available languages see
            https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages.
            Defaults to None.
        input_file: path to the image file that should be beneath the text
            in output pdf. If not specified (stdin, incorrect file) output
            pdf is correct but tesseract writes some errors about not being
            able to open the file. Defaults to stdin.
        textonly: create pdf with only one invisible text layer. Defaults
            to False.
    '''
    single_page = [image]
    single_input = [input_file]
    images_to_pdf(
        images=single_page,
        output_file=output_file,
        lang=lang,
        input_files=single_input,
        textonly=textonly
    )
def is_available():
available = tesseract_raw.is_available()
if not available:
......
......@@ -1379,6 +1379,7 @@ class TestLibTesseractPDF(BaseTest):
def setUp(self):
    # Two separate 1x1 RGB placeholder images, used as the "pages" that
    # the tests pass to the PDF functions.
    blank_pixel = {"mode": "RGB", "size": (1, 1)}
    self.image = Image.new(**blank_pixel)
    self.image2 = Image.new(**blank_pixel)
    # Random 32-bit integer that the tests install as the return value of
    # the mocked tesseract_raw.init().
    max_handle = 2 ** 32 - 1
    self.handle = randint(0, max_handle)
@patch("pyocr.libtesseract.tesseract_raw")
......@@ -1407,6 +1408,37 @@ class TestLibTesseractPDF(BaseTest):
[call(self.handle), call(renderer)]
)
@patch("pyocr.libtesseract.tesseract_raw")
def test_multipage_pdf(self, raw):
    # Checks that images_to_pdf() initialises tesseract and the PDF
    # renderer once, then runs set_image / set_input_name / recognize /
    # add_renderer_image once per page, in page order.
    #
    # Mock.assert_called_with() only inspects the most recent call, so
    # repeating it cannot prove that two pages were processed. The
    # per-page expectations therefore compare the full call_args_list.
    renderer = randint(0, 2 ** 32 - 1)
    raw.init.return_value = self.handle
    raw.init_pdf_renderer.return_value = renderer
    pages = [self.image, self.image2]

    libtesseract.images_to_pdf(pages, "output")

    # One-time setup of the engine and the renderer.
    raw.init.assert_called_once_with(lang=None)
    raw.set_page_seg_mode.assert_called_once_with(
        self.handle, raw.PageSegMode.AUTO_OSD
    )
    raw.init_pdf_renderer.assert_called_once_with(
        self.handle, "output", False
    )
    raw.begin_document.assert_called_once_with(renderer, "")

    # Each page must be handed to tesseract exactly once and in order.
    # Identity (assertIs) is used because the two fixture images have the
    # same mode and size and may compare equal, which would hide a
    # swapped or duplicated page.
    set_image_calls = raw.set_image.call_args_list
    self.assertEqual(len(set_image_calls), len(pages))
    for (args, kwargs), page in zip(set_image_calls, pages):
        self.assertEqual(args[0], self.handle)
        self.assertIs(args[1], page)
        self.assertEqual(kwargs, {})

    # The single default input name ("stdin") is reused for every page.
    self.assertListEqual(
        raw.set_input_name.call_args_list,
        [call(self.handle, "stdin")] * len(pages)
    )
    self.assertListEqual(
        raw.recognize.call_args_list,
        [call(self.handle)] * len(pages)
    )
    self.assertListEqual(
        raw.add_renderer_image.call_args_list,
        [call(self.handle, renderer)] * len(pages)
    )

    # Document is closed once; handle is cleaned up before the renderer.
    raw.end_document.assert_called_once_with(renderer)
    self.assertListEqual(
        raw.cleanup.call_args_list,
        [call(self.handle), call(renderer)]
    )
@patch("pyocr.libtesseract.tesseract_raw")
def test_pdf_renderer_error(self, raw):
renderer = None
......@@ -1417,15 +1449,15 @@ class TestLibTesseractPDF(BaseTest):
libtesseract.image_to_pdf(self.image, "output")
raw.init.assert_called_once_with(lang=None)
raw.set_image.assert_called_once_with(self.handle, self.image)
raw.set_page_seg_mode.assert_called_once_with(
self.handle, raw.PageSegMode.AUTO_OSD
)
raw.set_input_name.assert_called_once_with(self.handle, "stdin")
raw.recognize.assert_called_once_with(self.handle)
raw.init_pdf_renderer.assert_called_once_with(
self.handle, "output", False
)
self.assertFalse(raw.set_image.called)
self.assertFalse(raw.set_input_name.called)
self.assertFalse(raw.recognize.called)
self.assertFalse(raw.begin_document.called)
self.assertFalse(raw.add_renderer_image.called)
self.assertFalse(raw.end_document.called)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment