From 3d100ae7bda5b0f4b804ff2ccb66e538d0a96f78 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Tue, 31 Mar 2026 10:45:40 +0200 Subject: [PATCH] :bug: fix split feature adding additional pages to output files --- mindee/extraction/common/extracted_image.py | 1 - mindee/v2/file_operations/crop_files.py | 7 ++--- mindee/v2/file_operations/split.py | 27 ++++++++++++------- mindee/v2/file_operations/split_files.py | 7 ++--- mindee/v2/product/split/split_range.py | 5 ++-- .../file_operations/test_split_operation.py | 5 ++-- .../test_split_operation_integration.py | 3 +-- 7 files changed, 30 insertions(+), 25 deletions(-) diff --git a/mindee/extraction/common/extracted_image.py b/mindee/extraction/common/extracted_image.py index 66f7631a..65acb81c 100644 --- a/mindee/extraction/common/extracted_image.py +++ b/mindee/extraction/common/extracted_image.py @@ -61,7 +61,6 @@ def save_to_file( if not file_format: if len(resolved_path.suffix) < 1: raise ValueError("Invalid file format.") - # Let PIL infer format from filename extension self.buffer.seek(0) image = Image.open(self.buffer) if file_format: diff --git a/mindee/v2/file_operations/crop_files.py b/mindee/v2/file_operations/crop_files.py index 9887b669..a5e5d131 100644 --- a/mindee/v2/file_operations/crop_files.py +++ b/mindee/v2/file_operations/crop_files.py @@ -7,14 +7,15 @@ class CropFiles(List[ExtractedImage]): """Crop files.""" - def save_all_to_disk(self, path: Union[Path, str]): + def save_all_to_disk(self, path: Union[Path, str], prefix: str = "crop"): """ Save all extracted crops to disk. - :param path: Path to save the extracted splits to + :param path: Path to save the extracted splits to. + :param prefix: Prefix to add to the filename, defaults to 'crop'. """ if isinstance(path, str): path = Path(path) path.mkdir(parents=True, exist_ok=True) for idx, split in enumerate(self, start=1): - split.save_to_file(path / f"crop_{idx:03}.jpg") + split.save_to_file(path / f"{prefix}_{idx:03}.jpg") diff --git a/mindee/v2/file_operations/split.py b/mindee/v2/file_operations/split.py index e528e7b4..68b28145 100644 --- a/mindee/v2/file_operations/split.py +++ b/mindee/v2/file_operations/split.py @@ -1,15 +1,28 @@ from typing import List, Union from mindee.error import MindeeError -from mindee.extraction import PdfExtractor +from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf +from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor from mindee.input.sources.local_input_source import LocalInputSource from mindee.v2.file_operations.split_files import SplitFiles -from mindee.v2.product.split.split_range import SplitRange + + +def extract_single_split( + input_source: LocalInputSource, split: List[int] +) -> ExtractedPdf: + """ + Extracts a single split as a complete PDF from the document. + + :param input_source: Input source to split. + :param split: List of pages to keep. + :return: Extracted PDF + """ + return extract_splits(input_source, [split])[0] def extract_splits( input_source: LocalInputSource, - splits: Union[List[SplitRange], List[List[int]]], + splits: Union[List[List[int]]], ) -> SplitFiles: """ Extracts splits as complete PDFs from the document. @@ -21,13 +34,7 @@ def extract_splits( pdf_extractor = PdfExtractor(input_source) page_groups = [] for split in splits: - if isinstance(split, SplitRange): - lower_bound = split.page_range[0] - upper_bound = split.page_range[1] - else: - lower_bound = split[0] - upper_bound = split[1] - page_groups.append(list(range(lower_bound, upper_bound + 1))) + page_groups.append(list(range(split[0], split[1] + 1))) if len(splits) < 1: raise MindeeError("No indexes provided.") return SplitFiles(pdf_extractor.extract_sub_documents(page_groups)) diff --git a/mindee/v2/file_operations/split_files.py b/mindee/v2/file_operations/split_files.py index 8368ecb2..fe451a77 100644 --- a/mindee/v2/file_operations/split_files.py +++ b/mindee/v2/file_operations/split_files.py @@ -7,14 +7,15 @@ class SplitFiles(List[ExtractedPdf]): """Split files.""" - def save_all_to_disk(self, path: Union[str, Path]): + def save_all_to_disk(self, path: Union[str, Path], prefix: str = "split"): """ Save all extracted splits to disk. - :param path: Path to save the extracted splits to + :param path: Path to save the extracted splits to. + :param prefix: Prefix to add to the filename, defaults to 'split'. """ if isinstance(path, str): path = Path(path) path.mkdir(parents=True, exist_ok=True) for idx, split in enumerate(self, start=1): - split.save_to_file(path / f"split_{idx:03}.pdf") + split.save_to_file(path / f"{prefix}_{idx:03}.pdf") diff --git a/mindee/v2/product/split/split_range.py b/mindee/v2/product/split/split_range.py index 9888a930..2a429b36 100644 --- a/mindee/v2/product/split/split_range.py +++ b/mindee/v2/product/split/split_range.py @@ -1,9 +1,9 @@ from typing import List from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf -from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor from mindee.input.sources.local_input_source import LocalInputSource from mindee.parsing.common.string_dict import StringDict +from mindee.v2.file_operations.split import extract_single_split class SplitRange: @@ -32,5 +32,4 @@ def extract_from_file(self, input_source: LocalInputSource) -> ExtractedPdf: :param input_source: Local file to apply the inference to :return: Extracted PDF """ - pdf_extractor = PdfExtractor(input_source) - return pdf_extractor.extract_sub_documents([self.page_range])[0] + return extract_single_split(input_source, self.page_range) diff --git a/tests/v2/file_operations/test_split_operation.py b/tests/v2/file_operations/test_split_operation.py index 8e70b5ac..60e481d4 100644 --- a/tests/v2/file_operations/test_split_operation.py +++ b/tests/v2/file_operations/test_split_operation.py @@ -2,7 +2,6 @@ import pytest -from mindee.v2.file_operations.split import extract_splits from mindee.input.sources.path_input import PathInput from mindee.v2.product.split.split_response import ( SplitResponse, @@ -37,7 +36,7 @@ def test_single_page_split_split(splits_default, splits_single_page_json_path): with open(splits_single_page_json_path, "rb") as f: response = json.load(f) doc = SplitResponse(response) - extracted_splits = extract_splits(input_sample, doc.inference.result.splits) + extracted_splits = doc.extract_from_file(input_sample) assert len(extracted_splits) == 1 assert extracted_splits[0].get_page_count() == 1 @@ -48,7 +47,7 @@ def test_multi_page_receipt_split(splits_5p, splits_multi_page_json_path): with open(splits_multi_page_json_path, "rb") as f: response = json.load(f) doc = SplitResponse(response) - extracted_splits = extract_splits(input_sample, doc.inference.result.splits) + extracted_splits = doc.extract_from_file(input_sample) assert len(extracted_splits) == 3 assert extracted_splits[0].get_page_count() == 1 diff --git a/tests/v2/file_operations/test_split_operation_integration.py b/tests/v2/file_operations/test_split_operation_integration.py index 9fba4ad9..edc319fa 100644 --- a/tests/v2/file_operations/test_split_operation_integration.py +++ b/tests/v2/file_operations/test_split_operation_integration.py @@ -10,7 +10,6 @@ SplitResponse, ) from mindee.input.sources.path_input import PathInput -from mindee.v2.file_operations.split import extract_splits from tests.utils import OUTPUT_DIR, V2_PRODUCT_DATA_DIR, cleanup_output_files @@ -38,7 +37,7 @@ def test_pdf_should_extract_splits(): ) assert response.inference.file.page_count == 2 - extracted_pdfs = extract_splits(split_input, response.inference.result.splits) + extracted_pdfs = response.extract_from_file(split_input) assert len(extracted_pdfs) == 2 assert extracted_pdfs[0].filename == "default_sample_001-001.pdf"