@@ -4299,4 +4299,242 @@ def create_and_edit_screenshot(data_set):
42994299 return "passed"
43004300 except :
43014301 CommonUtil .ExecLog (sModuleInfo , "Couldn't take screenshot or edit the image" , 3 )
4302+ return "zeuz_failed"
4303+
4304+
@logger
def extract_text_by_page(pdf_path, text, pgn=None):
    """Search a PDF for a string/regex and return every match.

    Args:
        pdf_path: Path to the PDF file.
        text: Plain string or regular-expression pattern to search for.
        pgn: 1-based page number to restrict the search to. ``None`` (default)
            searches the text of the whole document.

    Returns:
        The list produced by ``re.findall`` on the extracted text, or the string
        "zeuz_failed" when the pdfminer libraries cannot be imported, the
        requested page does not exist, or the extraction itself fails.
    """
    sModuleInfo = inspect.currentframe().f_code.co_name + " : " + MODULE_NAME
    try:
        from pdfminer.converter import TextConverter
        from pdfminer.layout import LAParams
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
        from pdfminer.pdfpage import PDFPage
        from pdfminer.pdfparser import PDFParser
        from pdfminer.high_level import extract_text
        import re
        import io
    except Exception:
        CommonUtil.ExecLog(sModuleInfo, "Couldn't import the libraries", 3)
        return "zeuz_failed"

    try:
        pattern = re.compile(str(text))

        if pgn is None:
            CommonUtil.ExecLog(sModuleInfo, "Searching in all over the PDF", 1)
            full_text = extract_text(pdf_path)
        else:
            # Stays None unless the requested page is actually reached, so an
            # out-of-range page number is reported instead of silently
            # searching whichever page happened to be processed last.
            full_text = None
            with open(pdf_path, "rb") as file, io.StringIO() as output:
                parser = PDFParser(file)
                doc = PDFDocument(parser)
                rsrcmgr = PDFResourceManager()
                device = TextConverter(rsrcmgr, output, laparams=LAParams())
                try:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page_number, page in enumerate(PDFPage.create_pages(doc), start=1):
                        if page_number != pgn:
                            # Only the requested page needs to be rendered.
                            continue
                        interpreter.process_page(page)
                        full_text = output.getvalue()
                        CommonUtil.ExecLog(sModuleInfo, f"Searching in page Number {page_number}", 1)
                        break
                finally:
                    # Release the converter even when page processing raises.
                    device.close()

            if full_text is None:
                CommonUtil.ExecLog(sModuleInfo, f"Page number {pgn} was not found in the PDF", 3)
                return "zeuz_failed"

        return pattern.findall(full_text)
    except Exception:
        CommonUtil.ExecLog(sModuleInfo, "[extract_text_by_page] Couldn't extract data from PDF", 3)
        return "zeuz_failed"
4354+
4355+
@logger
def extract_text_pdf(dataset):
    """
    This action lets you extract specific text/string from a PDF file. You can specify the page number where
    you want to conduct the extraction. You can also use regular expressions to extract a pattern of text/string.

    filename            | input parameter       | Path to the PDF file.
    text                | input parameter       | The text that you want to extract. Can be either string or regular expression pattern.
    page                | optional parameter    | Page number where you want to execute the extraction
    extract text pdf    | utility action        | The variable name that will store the extracted strings

    Returns "passed" after the list of matches has been saved in the shared variable
    (default name "pdf_data"), otherwise "zeuz_failed".
    """

    sModuleInfo = inspect.currentframe().f_code.co_name + " : " + MODULE_NAME
    CommonUtil.ExecLog(sModuleInfo, "Function Start", 0)

    # Best-effort dependency bootstrap: a failure here is logged, and the real
    # verdict is left to the import attempt made during extraction.
    try:
        import importlib.util
        import sys

        # Probe the running interpreter directly; this works on every platform
        # (the 'findstr' utility exists only on Windows).
        if importlib.util.find_spec("pdfminer") is not None:
            CommonUtil.ExecLog(sModuleInfo, "Pdfminer is already installed", 5)
        else:
            CommonUtil.ExecLog(sModuleInfo, "Installing Pdfminer", 5)
            # Use this interpreter's own pip so the package lands in the
            # environment that will import it.
            subprocess.run(
                [
                    sys.executable, "-m", "pip", "install",
                    "--trusted-host", "pypi.org",
                    "--trusted-host", "files.pythonhosted.org",
                    "pdfminer.six",
                ],
                input="y\n",
                text=True,
                check=True,
            )
            # Make the freshly installed package visible to the import system.
            importlib.invalidate_caches()
    except Exception:
        CommonUtil.ExecLog(sModuleInfo, "Could not install the Pdfminer module", 3)

    filename = None
    pg = None
    text = None
    var_name = "pdf_data"

    try:
        for left, mid, right in dataset:
            field = left.strip().lower()
            if field == "filename":
                filename = right.strip()
            elif field == "page":
                pg = int(right.strip())
            elif field == "text":
                text = right.strip()
            elif mid.strip().lower() == "action":
                var_name = right.strip()
    except Exception:
        CommonUtil.ExecLog(sModuleInfo, "Couldn't parse the dataset", 3)
        return "zeuz_failed"

    # Reject missing and blank values alike.
    if not filename or not text:
        CommonUtil.ExecLog(sModuleInfo, "filename or text field cannot be empty", 3)
        return "zeuz_failed"

    try:
        data = extract_text_by_page(filename, text, pg)
        # The helper reports failure through the "zeuz_failed" sentinel rather
        # than an exception; do not store that sentinel as extracted data.
        if isinstance(data, str) and data == "zeuz_failed":
            CommonUtil.ExecLog(sModuleInfo, "[extract_text_pdf] Couldn't extract data from PDF", 3)
            return "zeuz_failed"
        Shared_Resources.Set_Shared_Variables(var_name, data)
        return "passed"
    except Exception:
        CommonUtil.ExecLog(sModuleInfo, "[extract_text_pdf] Couldn't extract data from PDF", 3)
        return "zeuz_failed"
4432+
4433+
@logger
def extract_table_pdf(dataset):
    """
    This action lets you extract tabular data from PDF. You can also specify some additional parameters
    to make the extraction more precise, for example, which page the table should be extracted from,
    if there are multiple tables then you can select a single specific table, you can add filters to
    narrow your search, specify the row and column to get the data

    filename            | input parameter       | Path to the PDF file.
    page                | optional parameter    | Page number where you want to execute the extraction
    index               | optional parameter    | Specify which table you want to extract
    filter              | optional parameter    | Add filter to the table just like we do in Pandas
    row index           | optional parameter    | Row numbers holding the data
    column index        | optional parameter    | Column number holding the data
    extract table pdf   | utility action        | The variable name that will store the extracted strings

    Returns "passed" after the selected values have been saved in the shared variable
    (default name "pdf_table_data"), otherwise "zeuz_failed".
    """
    sModuleInfo = inspect.currentframe().f_code.co_name + " : " + MODULE_NAME

    # Best-effort dependency bootstrap: a failure here is logged, and the real
    # verdict is left to the import attempt below.
    try:
        import importlib.util
        import sys

        # Probe the running interpreter directly; this works on every platform
        # (the 'findstr' utility exists only on Windows).
        if importlib.util.find_spec("tabula") is not None:
            CommonUtil.ExecLog(sModuleInfo, "Tabula is already installed", 5)
        else:
            CommonUtil.ExecLog(sModuleInfo, "Installing Tabula", 5)
            # Use this interpreter's own pip so the package lands in the
            # environment that will import it.
            subprocess.run(
                [
                    sys.executable, "-m", "pip", "install",
                    "--trusted-host", "pypi.org",
                    "--trusted-host", "files.pythonhosted.org",
                    "tabula-py",
                ],
                input="y\n",
                text=True,
                check=True,
            )
            # Make the freshly installed package visible to the import system.
            importlib.invalidate_caches()
    except Exception:
        CommonUtil.ExecLog(sModuleInfo, "Could not install the Tabula module", 3)

    try:
        import tabula
        import warnings
        # Suppress FutureWarnings
        warnings.simplefilter(action="ignore", category=FutureWarning)
    except Exception:
        CommonUtil.ExecLog(sModuleInfo, "Couldn't import the libraries", 3)
        return "zeuz_failed"

    filename = None
    pg = "all"
    index = 0
    filter_expr = None
    row_index = "0"
    col_index = "0"
    var_name = "pdf_table_data"

    try:
        for left, mid, right in dataset:
            field = left.strip().lower()
            if field == "filename":
                filename = right.strip()
            elif field == "page":
                pg = right.strip()
            elif field == "index":
                index = int(right.strip())
            elif field == "filter":
                filter_expr = right.strip()
            elif field == "row index":
                row_index = right.strip()
            elif field == "column index":
                col_index = right.strip()
            elif mid.strip().lower() == "action":
                var_name = right.strip()
    except Exception:
        CommonUtil.ExecLog(sModuleInfo, "Couldn't parse the dataset", 3)
        return "zeuz_failed"

    if not filename:
        CommonUtil.ExecLog(sModuleInfo, "filename field cannot be empty", 3)
        return "zeuz_failed"

    try:
        df = tabula.read_pdf(filename, pages=pg)[index]
    except Exception:
        CommonUtil.ExecLog(sModuleInfo, "Could not read the table", 3)
        return "zeuz_failed"

    try:
        # SECURITY NOTE: 'filter', 'row index' and 'column index' are evaluated
        # as Python expressions, so they must only come from trusted test-case
        # data. They are evaluated against a copy of the module namespace (plus
        # 'df') so the table never leaks into, or lingers in, module globals.
        namespace = dict(globals(), df=df)

        if filter_expr:
            df = eval(f"df[{filter_expr}]", namespace)
            namespace["df"] = df

        selection = eval(f"df.iloc[{row_index}, {col_index}]", namespace)
        if row_index.isnumeric():
            # A single row index gives one selection; wrap it so the stored
            # value is always a list.
            vals = [selection]
        else:
            vals = selection.to_list()

        Shared_Resources.Set_Shared_Variables(var_name, vals)
        return "passed"
    except Exception:
        CommonUtil.ExecLog(sModuleInfo, "Could not extract data", 3)
        return "zeuz_failed"
0 commit comments