From f3d7bd983b882bbe6b166bc40948c156795a43f8 Mon Sep 17 00:00:00 2001 From: sprovence Date: Fri, 4 Jun 2021 17:06:14 -0500 Subject: [PATCH 01/10] Fixed run_path in PyTessy __init__ --- source/pytessy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/pytessy.py b/source/pytessy.py index 6137fc4..fa45079 100644 --- a/source/pytessy.py +++ b/source/pytessy.py @@ -33,7 +33,7 @@ import __main__ import ctypes import ctypes.util -from os import chdir, environ +from os import chdir, environ, getcwd from os.path import abspath, dirname, isabs, isdir, isfile, join from sys import platform @@ -259,7 +259,7 @@ def __init__(self, tesseract_path=None, api_version=None, lib_path=None, FileNotFoundError If cannot found "tessdata" directory. """ - run_path = dirname(abspath(__main__.__file__)) + run_path = dirname(abspath(getcwd())) no_lib = True if lib_path is not None: if isfile(lib_path): From cd061aa20a3b794f1d7a6ca2084ee3ff76097c82 Mon Sep 17 00:00:00 2001 From: sprovence Date: Fri, 4 Jun 2021 17:23:24 -0500 Subject: [PATCH 02/10] added setup.py --- source/setup.py | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 source/setup.py diff --git a/source/setup.py b/source/setup.py new file mode 100644 index 0000000..26126d8 --- /dev/null +++ b/source/setup.py @@ -0,0 +1,3 @@ +from setuptools import setup + +setup(use_scm_version=True) \ No newline at end of file From d3d3b3b62cee28ebbf93030dff08106363a021f4 Mon Sep 17 00:00:00 2001 From: sprovence Date: Mon, 7 Jun 2021 11:42:42 -0500 Subject: [PATCH 03/10] Added setup.py --- source/setup.py => setup.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename source/setup.py => setup.py (100%) diff --git a/source/setup.py b/setup.py similarity index 100% rename from source/setup.py rename to setup.py From 4181f70788eb047ce9938128baa6d913a41e69dc Mon Sep 17 00:00:00 2001 From: Mark Haley Date: Mon, 7 Jun 2021 13:49:58 -0400 Subject: [PATCH 04/10] Add setup.cfg file. --- setup.cfg | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 setup.cfg diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..2eace54 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,15 @@ +[metadata] +name = pytessy +license = Internal +long_description = file: README.md +long_description_content_type = text/markdown; charset=UTF-8 +platform = any + +[options] +python_requires = >=3.6 +packages = pytessy + +setup_requires = + setuptools >=38.3.0 + pip >= 20.0 + setuptools_scm From 186e79e7aa370ef29d2bbbff1fad2304dfd3e18d Mon Sep 17 00:00:00 2001 From: Mark Haley Date: Mon, 7 Jun 2021 13:59:49 -0400 Subject: [PATCH 05/10] Testing. --- setup.cfg | 15 --------------- setup.py | 3 --- 2 files changed, 18 deletions(-) delete mode 100644 setup.cfg delete mode 100644 setup.py diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 2eace54..0000000 --- a/setup.cfg +++ /dev/null @@ -1,15 +0,0 @@ -[metadata] -name = pytessy -license = Internal -long_description = file: README.md -long_description_content_type = text/markdown; charset=UTF-8 -platform = any - -[options] -python_requires = >=3.6 -packages = pytessy - -setup_requires = - setuptools >=38.3.0 - pip >= 20.0 - setuptools_scm diff --git a/setup.py b/setup.py deleted file mode 100644 index 26126d8..0000000 --- a/setup.py +++ /dev/null @@ -1,3 +0,0 @@ -from setuptools import setup - -setup(use_scm_version=True) \ No newline at end of file From 638a53b425e746ccc5e3017c71387e2cb9763e52 Mon Sep 17 00:00:00 2001 From: Mark Haley Date: Mon, 7 Jun 2021 14:25:09 -0400 Subject: [PATCH 06/10] Re-add setup.cfg and setup.py. --- setup.cfg | 18 ++++++++++++++++++ setup.py | 3 +++ 2 files changed, 21 insertions(+) create mode 100644 setup.cfg create mode 100644 setup.py diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..787f656 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,18 @@ +[metadata] +name = pytessy +license = Internal +long_description = file: README.md +long_description_content_type = text/markdown; charset=UTF-8 +platform = any + +[options] +name = pytessy +python_requires = >=3.8 +packages = pytessy +package_dir = + pytessy=source + +setup_requires = + setuptools >=38.3.0 + pip >= 20.0 + setuptools_scm diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..d5d43d7 --- /dev/null +++ b/setup.py @@ -0,0 +1,3 @@ +from setuptools import setup + +setup(use_scm_version=True) From 7985603c9a480868e87186fbf8c0c72d709a118e Mon Sep 17 00:00:00 2001 From: sprovence Date: Mon, 7 Jun 2021 16:43:34 -0500 Subject: [PATCH 07/10] changed run_path --- source/pytessy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/pytessy.py b/source/pytessy.py index fa45079..caa5164 100644 --- a/source/pytessy.py +++ b/source/pytessy.py @@ -259,7 +259,7 @@ def __init__(self, tesseract_path=None, api_version=None, lib_path=None, FileNotFoundError If cannot found "tessdata" directory. """ - run_path = dirname(abspath(getcwd())) + run_path = getcwd() no_lib = True if lib_path is not None: if isfile(lib_path): From c66dd139ff618dcf2267c42a10a992f26ae875b8 Mon Sep 17 00:00:00 2001 From: sprovence Date: Tue, 8 Jun 2021 10:33:33 -0500 Subject: [PATCH 08/10] Removed chdir --- source/pytessy.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/source/pytessy.py b/source/pytessy.py index caa5164..0417d56 100644 --- a/source/pytessy.py +++ b/source/pytessy.py @@ -91,7 +91,7 @@ def __init__(self, lib_path=None, data_path=None, language='eng'): - def get_text(self): + def get_text(self, config=None): """ Gets text as utf-8 decoded string --------------------------------- @@ -99,7 +99,10 @@ def get_text(self): """ self._check_setup() - result = self._lib.TessBaseAPIGetUTF8Text(self._api) + if config: + result = self._lib.TessBaseAPIGetUTF8Text(self._api, config=config) + else: + result = self._lib.TessBaseAPIGetUTF8Text(self._api) if result: return result.decode('utf-8') @@ -258,8 +261,7 @@ def __init__(self, tesseract_path=None, api_version=None, lib_path=None, search process. FileNotFoundError If cannot found "tessdata" directory. """ - - run_path = getcwd() + run_path = dirname(abspath(__main__.__file__)) no_lib = True if lib_path is not None: if isfile(lib_path): @@ -317,15 +319,13 @@ def __init__(self, tesseract_path=None, api_version=None, lib_path=None, break if data_path is None: raise FileNotFoundError('PyTessy: Couldn\'t find "tessdata" directory.') - chdir(tess_path) self._tess = TesseractHandler(lib_path=lib_path, data_path=data_path, language=language) - chdir(run_path) def justread(self, raw_image_ctypes, width, height, bytes_per_pixel, - bytes_per_line, resolution=96): + bytes_per_line, resolution=96, config=None): """ Reads text as utf-8 string from raw image data without any check ---------------------------------------------------------------- @@ -342,7 +342,7 @@ def justread(self, raw_image_ctypes, width, height, bytes_per_pixel, self._tess.set_image(raw_image_ctypes, width, height, bytes_per_pixel, bytes_per_line, resolution) - return self._tess.get_text() + return self._tess.get_text(config=config) @@ -369,7 +369,7 @@ def justread_raw(self, raw_image_ctypes, width, height, bytes_per_pixel, def read(self, imagedata, width, height, bytes_per_pixel, resolution=96, - raw=False): + raw=False, config=None): """ Reads text from image data -------------------------- @@ -390,7 +390,7 @@ def read(self, imagedata, width, height, bytes_per_pixel, resolution=96, bytes_per_line, resolution) else: return self.justread(imagedata, width, height, bytes_per_pixel, - bytes_per_line, resolution) + bytes_per_line, resolution, config=config) From 5fbd627b58d3ccbbaba9111b291258170d3cf58b Mon Sep 17 00:00:00 2001 From: sprovence Date: Tue, 8 Jun 2021 11:59:48 -0500 Subject: [PATCH 09/10] Added set_variable in TesseractHandler, passes oem, psm, char_whitelist in Pytessy init --- source/pytessy.py | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/source/pytessy.py b/source/pytessy.py index 0417d56..927f330 100644 --- a/source/pytessy.py +++ b/source/pytessy.py @@ -91,7 +91,7 @@ def __init__(self, lib_path=None, data_path=None, language='eng'): - def get_text(self, config=None): + def get_text(self): """ Gets text as utf-8 decoded string --------------------------------- @@ -99,12 +99,11 @@ def get_text(self, config=None): """ self._check_setup() - if config: - result = self._lib.TessBaseAPIGetUTF8Text(self._api, config=config) - else: - result = self._lib.TessBaseAPIGetUTF8Text(self._api) + result = self._lib.TessBaseAPIGetUTF8Text(self._api) if result: return result.decode('utf-8') + else: + return "" @@ -140,6 +139,17 @@ def set_image(self, imagedata, width, height, bytes_per_pixel, bytes_per_line, imagedata, width, height, bytes_per_pixel, bytes_per_line) self._lib.TessBaseAPISetSourceResolution(self._api, resolution) + + + def set_variable(self, key, val): + """ + Sets a variable in Tesseract + ---------- + @Params: key + val : TYPE + """ + self._check_setup() + self._lib.TessBaseAPISetVariable(self._api, key, val) @@ -175,7 +185,11 @@ def setup_lib(cls, lib_path=None): ctypes.c_int, # height ctypes.c_int, # bytes_per_pixel ctypes.c_int) # bytes_per_line - + + lib.TessBaseAPISetVariable.argtypes = (cls.TessBaseAPI, + ctypes.c_char_p, + ctypes.c_char_p) + lib.TessBaseAPIGetUTF8Text.restype = ctypes.c_char_p # text lib.TessBaseAPIGetUTF8Text.argtypes = (cls.TessBaseAPI, ) # handle @@ -230,7 +244,8 @@ class PyTessy(object): def __init__(self, tesseract_path=None, api_version=None, lib_path=None, - data_path=None, language='eng', verbose_search=False): + data_path=None, language='eng', verbose_search=False, + oem=1, psm=7, char_whitelist=None): """ Initializes PyTessy instance ---------------------------- @@ -321,11 +336,16 @@ def __init__(self, tesseract_path=None, api_version=None, lib_path=None, raise FileNotFoundError('PyTessy: Couldn\'t find "tessdata" directory.') self._tess = TesseractHandler(lib_path=lib_path, data_path=data_path, language=language) + self._tess.set_variable(b"tessedit_pageseg_mode", bytes(psm)) + self._tess.set_variable(b"tessedit_ocr_engine_mode", bytes(oem)) + if char_whitelist: + self._tess.set_variable(b"tessedit_char_whitelist", char_whitelist) + def justread(self, raw_image_ctypes, width, height, bytes_per_pixel, - bytes_per_line, resolution=96, config=None): + bytes_per_line, resolution=96): """ Reads text as utf-8 string from raw image data without any check ---------------------------------------------------------------- @@ -342,7 +362,7 @@ def justread(self, raw_image_ctypes, width, height, bytes_per_pixel, self._tess.set_image(raw_image_ctypes, width, height, bytes_per_pixel, bytes_per_line, resolution) - return self._tess.get_text(config=config) + return self._tess.get_text() @@ -369,7 +389,7 @@ def justread_raw(self, raw_image_ctypes, width, height, bytes_per_pixel, def read(self, imagedata, width, height, bytes_per_pixel, resolution=96, - raw=False, config=None): + raw=False): """ Reads text from image data -------------------------- @@ -390,7 +410,7 @@ def read(self, imagedata, width, height, bytes_per_pixel, resolution=96, bytes_per_line, resolution) else: return self.justread(imagedata, width, height, bytes_per_pixel, - bytes_per_line, resolution, config=config) + bytes_per_line, resolution) From fc4824b50993ee670b98cb81561b9f0b77e4bffa Mon Sep 17 00:00:00 2001 From: Alan J Castonguay Date: Tue, 8 Jun 2021 19:01:55 -0400 Subject: [PATCH 10/10] Cite project urls and license in setup.cfg Cite project urls and license in setup.cfg --- setup.cfg | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 787f656..9b1d7da 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,9 +1,19 @@ [metadata] name = pytessy -license = Internal +author = hyperrixel +url = https://github.com/hyperrixel/pytessy +project_urls = + Bug Tracker = https://github.com/hyperrixel/pytessy/issues + Documentation = https://pytessy.readthedocs.io/ + Source Code = https://github.com/hyperrixel/pytessy + + +license = Boost Software License 1.0 long_description = file: README.md long_description_content_type = text/markdown; charset=UTF-8 platform = any +license_files = + LICENSE [options] name = pytessy