Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/cron.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@
uses: mindee/mindee-api-python/.github/workflows/_test-regressions.yml@main
secrets: inherit
test-code-samples:
uses: mindee/mindee-api-python/.github/workflows/_smoke_test.yml@main
uses: mindee/mindee-api-python/.github/workflows/_smoke-test.yml@main
secrets: inherit

Check warning

Code scanning / CodeQL

Workflow does not contain permissions Medium

Actions job or workflow does not limit the permissions of the GITHUB_TOKEN. Consider setting an explicit permissions block, using the following as a minimal starting point: {}
4 changes: 3 additions & 1 deletion examples/auto_multi_receipts_extraction_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ def parse_receipts(input_path):
extracted_receipts = extract_receipts(input_doc, result_split.document.inference)

for idx, receipt in enumerate(extracted_receipts, 1):
result_receipt = mindee_client.parse(product.ReceiptV5, receipt.as_source())
result_receipt = mindee_client.parse(
product.ReceiptV5, receipt.as_input_source()
)
print(f"Receipt {idx}:")
print(result_receipt.document)
print("-" * 40)
Expand Down
20 changes: 15 additions & 5 deletions mindee/extraction/common/extracted_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ class ExtractedImage:
"""Id of the page the image was extracted from."""
_element_id: int
"""Id of the element on a given page."""
filename: str
"""Name of the file the image was extracted from."""

def __init__(
self, input_source: LocalInputSource, page_id: int, element_id: int
Expand All @@ -30,6 +32,7 @@ def __init__(
"""
self.buffer = io.BytesIO(input_source.file_object.read())
self.buffer.name = input_source.filename
self.filename = input_source.filename
if input_source.is_pdf():
extension = "jpg"
else:
Expand All @@ -56,20 +59,27 @@ def save_to_file(self, output_path: str, file_format: Optional[str] = None):
if not file_format:
if len(resolved_path.suffix) < 1:
raise ValueError("Invalid file format.")
file_format = (
resolved_path.suffix.upper()
) # technically redundant since PIL applies an upper operation
# to the parameter , but older versions may not do so.
# Let PIL infer format from filename extension
self.buffer.seek(0)
image = Image.open(self.buffer)
image.save(resolved_path, format=file_format)
if file_format:
image.save(resolved_path, format=file_format)
else:
image.save(resolved_path)
logger.info("File saved successfully to '%s'.", resolved_path)
except TypeError as exc:
raise MindeeError("Invalid path/filename provided.") from exc
except Exception as exc:
print(exc)
raise MindeeError(f"Could not save file {Path(output_path).name}.") from exc

def as_source(self) -> FileInput:
    """
    Deprecated. Use ``as_input_source`` instead.

    Backward-compatible alias: forwards the call to ``as_input_source``
    and returns its result unchanged.

    :return: The value returned by ``as_input_source``.
    """
    return self.as_input_source()

def as_input_source(self) -> FileInput:
"""
Return the file as a Mindee-compatible BufferInput source.

Expand Down
8 changes: 5 additions & 3 deletions mindee/extraction/common/image_extractor.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import io
from typing import BinaryIO, List
from typing import BinaryIO, List, Union

import pypdfium2 as pdfium
from PIL import Image

from mindee.error.mindee_error import MindeeError
from mindee.extraction.common.extracted_image import ExtractedImage
from mindee.geometry.point import Point
from mindee.geometry.polygon import get_min_max_x, get_min_max_y
from mindee.geometry.polygon import Polygon, get_min_max_x, get_min_max_y
from mindee.input.sources.bytes_input import BytesInput
from mindee.input.sources.local_input_source import LocalInputSource

Expand Down Expand Up @@ -114,7 +114,9 @@ def get_file_extension(file_format: str):


def extract_multiple_images_from_source(
input_source: LocalInputSource, page_id: int, polygons: List[List[Point]]
input_source: LocalInputSource,
page_id: int,
polygons: Union[List[Polygon], List[List[Point]]],
Comment thread
sebastianMindee marked this conversation as resolved.
Outdated
) -> List[ExtractedImage]:
"""
Extracts elements from a page based on a list of bounding boxes.
Expand Down
5 changes: 5 additions & 0 deletions mindee/extraction/pdf_extractor/extracted_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ def get_page_count(self) -> int:
) from exc

def write_to_file(self, output_path: str):
    """
    Deprecated. Use ``save_to_file`` instead.

    Backward-compatible alias: forwards ``output_path`` to ``save_to_file``.

    :param output_path: Path passed through to ``save_to_file``.
    """
    self.save_to_file(output_path)

def save_to_file(self, output_path: str):
"""
Writes the contents of the current PDF object to a file.

Expand All @@ -40,6 +44,7 @@ def write_to_file(self, output_path: str):
raise MindeeError("Invalid save path provided {}.")
if out_path.suffix.lower() != "pdf":
out_path = out_path.parent / (out_path.stem + "." + "pdf")
self.pdf_bytes.seek(0)
with open(out_path, "wb") as out_file:
out_file.write(self.pdf_bytes.read())

Expand Down
4 changes: 3 additions & 1 deletion mindee/mindee_http/mindee_api_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
import requests

from mindee.error.mindee_error import MindeeApiV2Error
from mindee.input import LocalInputSource, UrlInputSource, BaseParameters
from mindee.input.base_parameters import BaseParameters
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.input.sources.url_input_source import UrlInputSource
from mindee.logger import logger
from mindee.mindee_http.base_settings import USER_AGENT
from mindee.mindee_http.settings_mixin import SettingsMixin
Expand Down
4 changes: 3 additions & 1 deletion mindee/mindee_http/workflow_endpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

import requests

from mindee.input import LocalInputSource, UrlInputSource, WorkflowOptions
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.input.sources.url_input_source import UrlInputSource
from mindee.input.workflow_options import WorkflowOptions
from mindee.mindee_http.base_endpoint import BaseEndpoint
from mindee.mindee_http.workflow_settings import WorkflowSettings

Expand Down
4 changes: 4 additions & 0 deletions mindee/v2/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from mindee.v2.file_operations.split import Split
from mindee.v2.file_operations.crop import Crop
from mindee.v2.product.classification.classification_parameters import (
ClassificationParameters,
)
Expand All @@ -14,10 +16,12 @@
__all__ = [
"ClassificationResponse",
"ClassificationParameters",
"Crop",
"CropResponse",
"CropParameters",
"OCRResponse",
"OCRParameters",
"Split",
"SplitResponse",
"SplitParameters",
]
4 changes: 4 additions & 0 deletions mindee/v2/file_operations/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from mindee.v2.file_operations.crop import Crop
from mindee.v2.file_operations.split import Split

__all__ = ["Crop", "Split"]
69 changes: 69 additions & 0 deletions mindee/v2/file_operations/crop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from typing import List

from mindee.error import MindeeError
from mindee.extraction import ExtractedImage, extract_multiple_images_from_source
from mindee.geometry import Polygon
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.parsing.v2.field import FieldLocation
from mindee.v2.product.crop.crop_box import CropBox


class Crop:
    """Crop operations for V2."""

    @classmethod
    def extract_single_crop(
        cls, input_source: LocalInputSource, crop: FieldLocation
    ) -> ExtractedImage:
        """
        Extract a single crop from the document as an image.

        :param input_source: Local input source to extract the crop from.
        :param crop: Location (page index and polygon) of the crop to extract.
        :return: The extracted image.
        """
        return extract_multiple_images_from_source(
            input_source, crop.page, [crop.polygon]
        )[0]

    @classmethod
    def extract_crops(
        cls, input_source: LocalInputSource, crops: List[CropBox]
    ) -> List[ExtractedImage]:
        """
        Extract every crop of a document as an individual image.

        Polygons are grouped by the page index of their location, then each
        page holding at least one polygon is processed once, in page order.

        :param input_source: Local input source to extract the crops from.
        :param crops: List of crops.
        :return: Extracted images, ordered by page index.
        :raises MindeeError: If ``crops`` is empty.
        """
        if not crops:
            raise MindeeError("No possible candidates found for Crop extraction.")

        # One bucket per page so that each page is handed to the extractor once.
        polygons_by_page: List[List[Polygon]] = [
            [] for _ in range(input_source.page_count)
        ]
        for crop in crops:
            polygons_by_page[crop.location.page].append(crop.location.polygon)

        images: List[ExtractedImage] = []
        for page_index, page_polygons in enumerate(polygons_by_page):
            if not page_polygons:
                # Nothing to cut out of this page: skip the extractor call.
                continue
            images.extend(
                extract_multiple_images_from_source(
                    input_source,
                    page_index,
                    page_polygons,
                )
            )
        return images

    @classmethod
    def apply(
        cls,
        input_source: LocalInputSource,
        crops: List[CropBox],
    ) -> List[ExtractedImage]:
        """
        Crop a document into multiple images.

        Convenience entry point that forwards to ``extract_crops``.

        :param input_source: Input source to crop.
        :param crops: List of crops.
        :return: Extracted images, ordered by page index.
        :raises MindeeError: If ``crops`` is empty.
        """
        return cls.extract_crops(input_source, crops)
49 changes: 49 additions & 0 deletions mindee/v2/file_operations/split.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from typing import List, Union

from mindee.error import MindeeError
from mindee.extraction import ExtractedPdf, PdfExtractor
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.v2.product.split.split_range import SplitRange


class Split:
    """Split operations for V2."""

    @classmethod
    def extract_splits(
        cls,
        input_source: LocalInputSource,
        splits: Union[List[SplitRange], List[List[int]]],
    ) -> List[ExtractedPdf]:
        """
        Extract splits from the document as complete PDFs.

        Each split is read as a pair of bounds (first page index, last page
        index) and expanded into every page index between them, both bounds
        included.

        :param input_source: Input source to split.
        :param splits: ``SplitRange`` objects, or ``[first, last]`` pairs of
            page indexes.
        :return: A list of extracted PDFs, one per split.
        :raises MindeeError: If ``splits`` is empty.
        """
        # Fail fast: no need to build an extractor when there is nothing to do.
        if not splits:
            raise MindeeError("No indexes provided.")

        page_groups = []
        for split in splits:
            bounds = split.page_range if isinstance(split, SplitRange) else split
            lower_bound = bounds[0]
            upper_bound = bounds[1]
            # The upper bound is inclusive, hence the "+ 1".
            page_groups.append(list(range(lower_bound, upper_bound + 1)))

        pdf_extractor = PdfExtractor(input_source)
        return pdf_extractor.extract_sub_documents(page_groups)

    @classmethod
    def apply(
        cls,
        input_source: LocalInputSource,
        splits: Union[List[SplitRange], List[List[int]]],
    ) -> List[ExtractedPdf]:
        """
        Split a document into multiple PDFs.

        Convenience entry point that forwards to ``extract_splits``.

        :param input_source: Input source to split.
        :param splits: List of splits.
        :return: A list of extracted PDFs, one per split.
        :raises MindeeError: If ``splits`` is empty.
        """
        return cls.extract_splits(input_source, splits)
13 changes: 13 additions & 0 deletions mindee/v2/product/crop/crop_box.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from mindee.extraction import ExtractedImage, extract_multiple_images_from_source
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.parsing.common.string_dict import StringDict
from mindee.parsing.v2.field.field_location import FieldLocation

Expand All @@ -16,3 +18,14 @@ def __init__(self, server_response: StringDict):

def __str__(self) -> str:
return f"* :Location: {self.location}\n :Object Type: {self.object_type}"

def apply_to_file(self, input_source: LocalInputSource) -> ExtractedImage:
    """
    Cut this crop box out of a local file.

    :param input_source: Local file to apply the crop to
    :return: Extracted image for this box's page and polygon
    """
    location = self.location
    extracted = extract_multiple_images_from_source(
        input_source, location.page, [location.polygon]
    )
    return extracted[0]
35 changes: 35 additions & 0 deletions mindee/v2/product/crop/crop_response.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
from typing import List

from mindee.error import MindeeError
from mindee.extraction.common.extracted_image import ExtractedImage
from mindee.extraction.common.image_extractor import extract_multiple_images_from_source
from mindee.geometry import Polygon
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.parsing.common.string_dict import StringDict
from mindee.v2.parsing.inference import BaseResponse
from mindee.v2.product.crop.crop_inference import CropInference
Expand All @@ -15,3 +22,31 @@ class CropResponse(BaseResponse):
def __init__(self, raw_response: StringDict) -> None:
super().__init__(raw_response)
self.inference = CropInference(raw_response["inference"])

def apply_to_file(self, input_source: LocalInputSource) -> List[ExtractedImage]:
    """
    Cut every crop of this response out of a local file.

    :param input_source: Local file to apply the inference to
    :return: List of extracted images, in page order
    :raises MindeeError: If the inference result holds no crops
    """
    detected_crops = self.inference.result.crops
    if not detected_crops:
        raise MindeeError("No possible candidates found for Crop extraction.")

    # One bucket of polygons per page of the input file.
    per_page: List[List[Polygon]] = [[] for _ in range(input_source.page_count)]
    for detected in detected_crops:
        per_page[detected.location.page].append(detected.location.polygon)

    extracted: List[ExtractedImage] = []
    for page_number, page_shapes in enumerate(per_page):
        if page_shapes:
            extracted.extend(
                extract_multiple_images_from_source(
                    input_source,
                    page_number,
                    page_shapes,
                )
            )
    return extracted
13 changes: 13 additions & 0 deletions mindee/v2/product/split/split_range.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from typing import List

from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.parsing.common.string_dict import StringDict


Expand All @@ -21,3 +24,13 @@ def __init__(self, server_response: StringDict):
def __str__(self) -> str:
page_range = ",".join([str(page_index) for page_index in self.page_range])
return f"* :Page Range: {page_range}\n :Document Type: {self.document_type}"

def apply_to_file(self, input_source: LocalInputSource) -> ExtractedPdf:
    """
    Apply the split range inference to a file and return a single extracted PDF.

    ``page_range`` is read as a pair of bounds (first page index, last page
    index) and expanded into every page index between them, both bounds
    included, before being handed to the extractor as one page group.

    :param input_source: Local file to apply the inference to
    :return: Extracted PDF
    """
    first_page = self.page_range[0]
    last_page = self.page_range[1]
    # Pass the full list of pages; passing the two bounds as-is would only
    # extract the first and the last page of the range.
    page_indexes = list(range(first_page, last_page + 1))
    pdf_extractor = PdfExtractor(input_source)
    return pdf_extractor.extract_sub_documents([page_indexes])[0]
14 changes: 14 additions & 0 deletions mindee/v2/product/split/split_response.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
from typing import List

from mindee.extraction import ExtractedPdf
from mindee.input.sources.local_input_source import LocalInputSource
from mindee.parsing.common.string_dict import StringDict
from mindee.v2.file_operations.split import Split
from mindee.v2.parsing.inference import BaseResponse
from mindee.v2.product.split.split_inference import SplitInference

Expand All @@ -15,3 +20,12 @@ class SplitResponse(BaseResponse):
def __init__(self, raw_response: StringDict) -> None:
super().__init__(raw_response)
self.inference = SplitInference(raw_response["inference"])

def apply_to_file(self, input_source: LocalInputSource) -> List[ExtractedPdf]:
    """
    Cut a local file into one PDF per split found by the inference.

    :param input_source: Local file to apply the inference to
    :return: List of extracted PDFs
    """
    split_ranges = self.inference.result.splits
    return Split.extract_splits(input_source, split_ranges)
Loading
Loading