Cannot import PDF if filename contains some special chars
PDF cannot be imported if the filename contains some special characters, such as quotation marks.
Steps to reproduce
- Download sample file:
$ wget https://www.oreilly.com/openbook/beosprog/book/ch01.pdf
- Rename the file or create a symlink for it:
$ ln -s ch01.pdf "Joaquim \"jrocha\" Rocha - 'top secret' files.pdf"
$ ls Joaquim*
Joaquim "jrocha" Rocha - 'top secret' files.pdf
- File → Import PDF → Open
Expected results
Importing PDF.
Actual results
Nothing happened.
Temporary patch
diff --git a/src/ocrfeeder/util/lib.py b/src/ocrfeeder/util/lib.py
index 7766230..fa828fb 100644
--- a/src/ocrfeeder/util/lib.py
+++ b/src/ocrfeeder/util/lib.py
@@ -18,6 +18,7 @@
###########################################################################
import os
+import re
import mimetypes
from PIL import Image
import tempfile
@@ -42,19 +43,54 @@ def getIconOrLabel(icon_name, label_text, icon_size = Gtk.IconSize.SMALL_TOOLBAR
label = None
return icon, label
+def getSafeGhostscriptPath(file_path):
+ return re.sub(r'[^\w !#$%&()*+,./:;<=>?@\[\\\]^_`{|}~-]', '_', file_path)
+
+def getSafeGhostscriptInputFilename(file_name):
+ return re.sub(r'[/]', '_', getSafeGhostscriptPath(file_name))
+
+def getSafeGhostscriptOutputBasename(file_name):
+ return re.sub(r'[%]', '_', getSafeGhostscriptInputFilename(file_name))
+
def convertPdfToImages(pdf_file, temp_dir = '/tmp'):
- dir_name = tempfile.mkdtemp(dir = temp_dir)
+ if not os.path.isfile(pdf_file):
+ debug('Unable to convert PDF: File does not exist: %s', pdf_file)
+ return None
+ try:
+ dir_name = tempfile.mkdtemp(dir = temp_dir)
+ except:
+ debug('Unable to convert PDF: Cannot create temp dir in: %s', temp_dir)
+ return None
debug('Converting PDF: %s to image', pdf_file)
resolution = 300
- file_name = os.path.splitext(os.path.basename(pdf_file))[0]
- command = 'gs -SDEVICE=jpeg -r%(resolution)sx%(resolution)s -sPAPERSIZE=letter ' \
- '-sOutputFile="%(temp_name)s/%(file_name)s_%%04d.jpg" ' \
- '-dNOPAUSE -dBATCH -- "%(pdf_file)s"' % \
+ size = 'letter'
+ file_name = os.path.basename(pdf_file)
+ base_name = os.path.splitext(file_name)[0]
+ pdf_path = pdf_file
+ file_name_safe = getSafeGhostscriptInputFilename(file_name)
+ base_name_safe = getSafeGhostscriptOutputBasename(base_name)
+ pdf_file_safe = getSafeGhostscriptPath(pdf_file)
+ if pdf_file != pdf_file_safe:
+ try:
+ pdf_path = os.path.join(dir_name, file_name_safe)
+ os.symlink(pdf_file, pdf_path)
+ except:
+ debug('Unable to convert PDF: Cannot create temp symlink in: %s', dir_name)
+ return None
+ command = 'gs -SDEVICE=jpeg -r%(resolution)sx%(resolution)s -sPAPERSIZE=%(size)s ' \
+ '-sOutputFile=\'%(temp_name)s/%(file_name)s_%%04d.jpg\' ' \
+ '-dNOPAUSE -dBATCH -- \'%(pdf_file)s\'' % \
{'temp_name': dir_name,
- 'file_name': file_name,
- 'pdf_file': pdf_file,
- 'resolution': resolution}
+ 'file_name': base_name_safe,
+ 'pdf_file': pdf_path,
+ 'resolution': resolution,
+ 'size': size}
os.popen(command).read()
+ if pdf_file != pdf_file_safe:
+ try:
+ os.unlink(pdf_path)
+ except:
+ debug('PDF conversion warning: Cannot remove temp symlink: %s', pdf_path)
return dir_name
def getImagesFromFolder(folder):
A PR is on the way.