Skip to content

Commit da7767d

Browse files
authored
Merge pull request #413 from AutomationSolutionz/pdf_actions
Pdf actions
2 parents abbda5f + 746c779 commit da7767d

3 files changed

Lines changed: 243 additions & 3 deletions

File tree

Framework/Built_In_Automation/Built_In_Utility/CrossPlatform/BuiltInUtilityFunction.py

Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4299,4 +4299,242 @@ def create_and_edit_screenshot(data_set):
42994299
return "passed"
43004300
except:
43014301
CommonUtil.ExecLog(sModuleInfo, "Couldn't take screenshot or edit the image", 3)
4302+
return "zeuz_failed"
4303+
4304+
4305+
@logger
def extract_text_by_page(pdf_path, text, pgn=None):
    """
    Search a PDF for a regular-expression pattern and return every match.

    pdf_path | Path to the PDF file to read.
    text     | Pattern to look for. It is compiled with ``re.compile`` so a plain
               string or a regular expression may be given.
    pgn      | Optional 1-based page number. When omitted (None) the whole
               document is searched.

    Returns the list produced by ``pattern.findall`` on success, or the string
    "zeuz_failed" when the libraries cannot be imported, the PDF cannot be
    read, or the requested page does not exist.
    """
    sModuleInfo = inspect.currentframe().f_code.co_name + " : " + MODULE_NAME
    try:
        from pdfminer.converter import TextConverter
        from pdfminer.layout import LAParams
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
        from pdfminer.pdfpage import PDFPage
        from pdfminer.pdfparser import PDFParser
        from pdfminer.high_level import extract_text
        import re
        import io
    except Exception:
        CommonUtil.ExecLog(sModuleInfo, "Couldn't import the libraries", 3)
        return "zeuz_failed"

    try:
        # str() keeps the behaviour of the former rf"{text}" for non-string input.
        pattern = re.compile(str(text))

        if pgn is None:
            CommonUtil.ExecLog(sModuleInfo, "Searching in all over the PDF", 1)
            full_text = extract_text(pdf_path)
        else:
            # Stays None unless the requested page is actually found, so an
            # out-of-range page number is reported instead of silently
            # searching some other page.
            full_text = None
            with open(pdf_path, "rb") as file, io.StringIO() as output:
                rsrcmgr = PDFResourceManager()
                device = TextConverter(rsrcmgr, output, laparams=LAParams())
                try:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    doc = PDFDocument(PDFParser(file))
                    for page_number, page in enumerate(PDFPage.create_pages(doc), start=1):
                        if page_number != pgn:
                            # Only the requested page needs to be rendered.
                            continue
                        interpreter.process_page(page)
                        full_text = output.getvalue()
                        CommonUtil.ExecLog(sModuleInfo, f"Searching in page Number {page_number}", 1)
                        break
                finally:
                    # Release the converter even when parsing fails part-way.
                    device.close()

            if full_text is None:
                CommonUtil.ExecLog(sModuleInfo, f"Page {pgn} does not exist in the PDF", 3)
                return "zeuz_failed"

        return pattern.findall(full_text)
    except Exception:
        CommonUtil.ExecLog(sModuleInfo, "[extract_text_by_page] Couldn't extract data from PDF", 3)
        return "zeuz_failed"
4354+
4355+
4356+
@logger
def extract_text_pdf(dataset):
    """
    This action lets you extract specific text/string from a PDF file. You can specify the page number where
    you want to conduct the extraction. You can also use regular expressions to extract a pattern of text/string.

    filename            | input parameter       | Path to the PDF file.
    text                | input parameter       | The text that you want to extract. Can be either string or regular expression pattern.
    page                | optional parameter    | Page number where you want to execute the extraction
    extract text pdf    | utility action        | The variable name that will store the extracted strings

    Returns "passed" after storing the list of matches in the shared variable
    (default name "pdf_data"), or "zeuz_failed" when the dataset cannot be
    parsed, a mandatory field is missing, or the extraction itself fails.
    """

    sModuleInfo = inspect.currentframe().f_code.co_name + " : " + MODULE_NAME
    CommonUtil.ExecLog(sModuleInfo, "Function Start", 0)

    # Best-effort installation of pdfminer.six. A failure here is only logged;
    # the import inside extract_text_by_page decides whether the action can run.
    try:
        import importlib.util
        import sys

        # Ask the import system directly instead of piping `pip list` into
        # `findstr`, which only exists on Windows.
        if importlib.util.find_spec("pdfminer") is not None:
            CommonUtil.ExecLog(
                sModuleInfo,
                "Pdfminer is already installed",
                5,
            )
        else:
            CommonUtil.ExecLog(
                sModuleInfo,
                "Installing Pdfminer",
                5,
            )
            # `sys.executable -m pip` installs into the interpreter that is
            # running this code rather than whichever `pip` is first on PATH.
            subprocess.run(
                [
                    sys.executable, '-m', 'pip', 'install',
                    '--trusted-host', 'pypi.org',
                    '--trusted-host', 'files.pythonhosted.org',
                    'pdfminer.six',
                ],
                input='y\n',
                text=True,
            )
            # Let the running interpreter see the freshly installed package.
            importlib.invalidate_caches()
    except Exception:
        CommonUtil.ExecLog(
            sModuleInfo,
            "Could not install the Pdfminer module",
            3,
        )

    filename = None
    pg = None
    text = None
    var_name = "pdf_data"

    try:
        for left, mid, right in dataset:
            field = left.strip().lower()
            if field == "filename":
                filename = right.strip()
            elif field == "page":
                pg = int(right.strip())
            elif field == "text":
                text = right.strip()
            elif mid.strip().lower() == "action":
                var_name = right.strip()
    except Exception:
        CommonUtil.ExecLog(sModuleInfo, "Couldn't parse the dataset", 3)
        return "zeuz_failed"

    if filename is None or text is None:
        CommonUtil.ExecLog(sModuleInfo, "filename or text field cannot be empty", 3)
        return "zeuz_failed"

    try:
        data = extract_text_by_page(filename, text, pg)
        # The helper signals failure with the string "zeuz_failed" and success
        # with a list; do not store the failure tag as if it were a result.
        if isinstance(data, str) and data == "zeuz_failed":
            CommonUtil.ExecLog(sModuleInfo, "[extract_text_pdf] Couldn't extract data from PDF", 3)
            return "zeuz_failed"
        Shared_Resources.Set_Shared_Variables(var_name, data)
        return "passed"
    except Exception:
        CommonUtil.ExecLog(sModuleInfo, "[extract_text_pdf] Couldn't extract data from PDF", 3)
        return "zeuz_failed"
4432+
4433+
4434+
@logger
def extract_table_pdf(dataset):
    """
    This action lets you extract tabular data from PDF. You can also specify some additional parameters
    to make the extraction more precise, for example, which page the table should be extracted from,
    if there are multiple tables then you can select a single specific table, you can add filters to
    narrow your search, specify the row and column to get the data

    filename            | input parameter       | Path to the PDF file.
    page                | optional parameter    | Page number where you want to execute the extraction
    index               | optional parameter    | Specify which table you want to extract
    filter              | optional parameter    | Add filter to the table just like we do in Pandas
    row index           | optional parameter    | Row numbers holding the data
    column index        | optional parameter    | Column number holding the data
    extract table pdf   | utility action        | The variable name that will store the extracted strings

    Defaults: page "all", index 0, row index "0", column index "0", variable
    name "pdf_table_data".

    Returns "passed" after storing the extracted values in the shared variable,
    or "zeuz_failed" when the libraries cannot be imported, the dataset cannot
    be parsed, the table cannot be read, or the filter/selection fails.
    """
    sModuleInfo = inspect.currentframe().f_code.co_name + " : " + MODULE_NAME

    # Best-effort installation of tabula-py. A failure here is only logged;
    # the import below decides whether the action can run.
    try:
        import importlib.util
        import sys

        # Ask the import system directly instead of piping `pip list` into
        # `findstr`, which only exists on Windows.
        if importlib.util.find_spec("tabula") is not None:
            CommonUtil.ExecLog(
                sModuleInfo,
                "Tabula is already installed",
                5,
            )
        else:
            CommonUtil.ExecLog(
                sModuleInfo,
                "Installing Tabula",
                5,
            )
            # `sys.executable -m pip` installs into the interpreter that is
            # running this code rather than whichever `pip` is first on PATH.
            subprocess.run(
                [
                    sys.executable, '-m', 'pip', 'install',
                    '--trusted-host', 'pypi.org',
                    '--trusted-host', 'files.pythonhosted.org',
                    'tabula-py',
                ],
                input='y\n',
                text=True,
            )
            # Let the running interpreter see the freshly installed package.
            importlib.invalidate_caches()
    except Exception:
        CommonUtil.ExecLog(
            sModuleInfo,
            "Could not install the Tabula module",
            3,
        )

    try:
        import tabula
        import warnings
        # Suppress FutureWarnings
        warnings.simplefilter(action='ignore', category=FutureWarning)
    except Exception:
        CommonUtil.ExecLog(sModuleInfo, "Couldn't import the libraries", 3)
        return "zeuz_failed"

    filename = None
    pg = "all"
    index = 0
    filter_expr = None
    row_index = "0"
    col_index = "0"
    var_name = "pdf_table_data"

    try:
        for left, mid, right in dataset:
            field = left.strip().lower()
            if field == "filename":
                filename = right.strip()
            elif field == "page":
                pg = right.strip()
            elif field == "index":
                index = int(right.strip())
            elif field == "filter":
                filter_expr = right.strip()
            elif field == "row index":
                row_index = right.strip()
            elif field == "column index":
                col_index = right.strip()
            elif mid.strip().lower() == "action":
                var_name = right.strip()
    except Exception:
        CommonUtil.ExecLog(sModuleInfo, "Couldn't parse the dataset", 3)
        return "zeuz_failed"

    try:
        df = tabula.read_pdf(filename, pages=pg)[index]
    except Exception:
        CommonUtil.ExecLog(sModuleInfo, "Could not read the table", 3)
        return "zeuz_failed"

    # SECURITY NOTE(review): the filter and the row/column selectors come from
    # the test-case dataset and are evaluated as Python expressions. This is
    # kept for backward compatibility (slices such as "0:3" and pandas boolean
    # masks rely on it), but it must never be fed untrusted input.
    # Evaluation happens in a *copy* of the module namespace so the same names
    # stay visible to the expressions without `df`/`vals` leaking into, or
    # overwriting, module-level globals.
    if filter_expr is not None:
        try:
            df = eval(f"df[{filter_expr}]", dict(globals(), df=df))
        except Exception:
            CommonUtil.ExecLog(sModuleInfo, "Could not apply the filter", 3)
            return "zeuz_failed"

    try:
        selection = eval(f"df.iloc[{row_index}, {col_index}]", dict(globals(), df=df))
        if row_index.isnumeric():
            # A single row: wrap the selected value so the result is a list.
            vals = [selection]
        else:
            # A slice/list of rows: convert the selected pandas object to a list.
            vals = selection.to_list()

        Shared_Resources.Set_Shared_Variables(var_name, vals)
        return "passed"
    except Exception:
        CommonUtil.ExecLog(sModuleInfo, "Could not extract data", 3)
        return "zeuz_failed"

Framework/Built_In_Automation/Sequential_Actions/action_declarations/utility.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,10 @@
3737
{ "name": "convert date format", "function": "convert_date_format", "screenshot": "none" },
3838
{ "name": "compare images", "function": "compare_images", "screenshot": "none" },
3939
{ "name": "datatype conversion", "function": "datatype_conversion", "screenshot": "none" },
40-
{"name": "new compare images", "function": "new_compare_images", "screenshot": "none"},
41-
{"name": "create and edit screenshot", "function": "create_and_edit_screenshot", "screenshot": "none"},
40+
{ "name": "new compare images", "function": "new_compare_images", "screenshot": "none"},
41+
{ "name": "extract text pdf", "function": "extract_text_pdf", "screenshot": "none"},
42+
{ "name": "extract table pdf", "function": "extract_table_pdf", "screenshot": "none"},
43+
{ "name": "create and edit screenshot", "function": "create_and_edit_screenshot", "screenshot": "none"},
4244
) # yapf: disable
4345

4446
module_name = "utility"

Framework/Utilities/decorators.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ def wrapper(*args, **kwargs):
1313
CommonUtil.ExecLog(None, f"Entering into function: {func.__name__!r}.", 5)
1414
custom_fail_message = ""
1515
result = func(*args, **kwargs)
16-
if result in failed_tag_list:
16+
if isinstance(result, str) and result in failed_tag_list:
1717
try:
1818
for row in args[0]:
1919
if row[1].replace(" ", "").lower() == "failmessage":

0 commit comments

Comments
 (0)