Problem allocate memory
Created by: SylvainBert
Hi, I wanna use a loop with python to transfrom several pdf in text but after 100 files i have this error :
Traceback (most recent call last):
File "Transform_text.py", line 68, in <module>
tmp
File "/usr/lib/python3.6/site-packages/pyocr/tesseract.py", line 365, in image_to_string
configs=builder.tesseract_configs)
File "/usr/lib/python3.6/site-packages/pyocr/tesseract.py", line 281, in run_tesseract
stderr=subprocess.STDOUT)
File "/usr/lib64/python3.6/subprocess.py", line 709, in __init__
restore_signals, start_new_session)
File "/usr/lib64/python3.6/subprocess.py", line 1275, in _execute_child
restore_signals, start_new_session, preexec_fn)
OSError: [Errno 12] Cannot allocate memory
the code is :
from wand.image import Image
from PIL import Image as PI
import io
import os
from os import listdir
from os.path import isfile, join
import json
import time
import pyocr
import subprocess
import shlex
import glob
tool = pyocr.get_available_tools()[0]
mypath = "/home/sylvain/CV_train/"
dossiers = os.listdir(mypath)
onlyfiles = []
for dos in dossiers:
path = mypath+dos
onlyfiles.append([f for f in listdir(path) if isfile(join(path, f))])
cv_txt = mypath+'/../Cv_text'
if not os.path.exists(cv_txt):
os.makedirs(cv_txt)
start_time = time.time()
pdf = 0
miss = 0
for index,fil in enumerate(onlyfiles):
print("\n dossier : "+dossiers[index])
fich = 0
for ff in fil:
if(ff.split('.')[-1] == 'pdf'):
fich = fich+1
print("\t fichier numero : "+str(fich))
pdf = pdf+1
file = mypath+dossiers[index]+'/'+ff
outfile = ff.split('.')[0]+'.txt'
outpath = cv_txt+'/'+ outfile
if not os.path.exists(outpath):
req_image = []
final_text = []
with Image(filename=file, resolution=300) as image_pdf:
image_jpeg = image_pdf.convert('jpeg')
for img in image_jpeg.sequence:
img_page = Image(image=img)
req_image.append(img_page.make_blob('jpeg'))
for img in req_image:
# I know the problem is from here, but i don't know what to do
with PI.open(io.BytesIO(img)) as tmp:
txt = tool.image_to_string(
tmp
)
final_text.append(txt)
decoup_text=[]
for dec in final_text:
decoup_text.append(dec.split("\n"))
text_complet = ''
for part in decoup_text:
for partie in part:
if(partie!=''):
text_complet = text_complet+' '+ partie
with io.open(outpath, 'w',encoding='utf-8') as outfile:
json.dump(text_complet, outfile,ensure_ascii=False)
Have a nice day!