Different results generated from pyocr and tesseract
Created by: eugene123tw
I am getting different results when I run the same image through the Tesseract C++ API and through pyocr.
Tesseract C++ code:
vector<vector<int>> getWord(Pix *image){
tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
cout << "Get Words!" << endl;
api->Init(NULL, "eng");
api->SetImage(image);
api->Recognize(0);
vector<vector<int>> bbs;
tesseract::ResultIterator* ri = api->GetIterator();
tesseract::PageIteratorLevel level = tesseract::RIL_WORD;
if (ri != 0) {
do {
const char* word = ri->GetUTF8Text(level);
float conf = ri->Confidence(level);
int x1, y1, x2, y2;
ri->BoundingBox(level, &x1, &y1, &x2, &y2);
printf("word: '%s'; \tconf: %.2f; BoundingBox: %d,%d,%d,%d;\n", word, conf, x1, y1, x2, y2);
vector<int> bb = {x1,y1,x2,y2};
bbs.push_back(bb);
delete[] word;
} while (ri->Next(level));
}
return bbs;
}
Result:
word: 'Teer'; conf: 26.52; BoundingBox: 765,345,953,432;
word: 'ats'; conf: 35.33; BoundingBox: 961,376,1155,432;
word: 'Kelly'; conf: 96.62; BoundingBox: 665,569,900,706;
word: 'Crews'; conf: 88.17; BoundingBox: 940,566,1256,694;
pyocr
python code:
import cv2
from PIL import Image
import sys
import pyocr
import pyocr.builders
# Path of the image to run OCR on.
filename = "xxx.jpg"

tools = pyocr.get_available_tools()
if len(tools) == 0:
    print("No OCR tool found")
    sys.exit(1)
# The tools are returned in the recommended order of usage
tool = tools[0]
print("Will use tool '%s'" % (tool.get_name()))
# Ex: Will use tool 'libtesseract'

langs = tool.get_available_languages()
print("Available languages: %s" % ", ".join(langs))
# Recognition always runs with "eng" (the same language the C++ snippet
# initializes Tesseract with), so report that language rather than langs[0],
# and stop early with a clear message when it is not installed.
lang = "eng"
if lang not in langs:
    print("Language '%s' is not available" % (lang))
    sys.exit(1)
print("Will use lang '%s'" % (lang))

# Open the image in a context manager so the file handle is closed once
# recognition is done.
with Image.open(filename) as image:
    word_boxes = tool.image_to_string(
        image,
        lang=lang,
        builder=pyocr.builders.WordBoxBuilder()
    )

# Print one line per detected word box.
for item in word_boxes:
    print(item)
Result:
Will use tool 'Tesseract (sh)'
Available languages: eng, osd
Will use lang 'eng'
Process finished with exit code 0