Commit 2da00a3

Lidang-Jiang, claude, and ArthurZucker authored
[Bugfix] Remove incorrect torchvision requirement from PIL backend image processors (#45045)
* [Bugfix] Remove incorrect torchvision requirement from PIL backend image processors

  PR #45029 added @requires(backends=("vision", "torch", "torchvision")) to 67 PIL backend image_processing_pil_*.py files. This causes the PIL backend classes to become dummy objects when torchvision is not installed, making AutoImageProcessor unable to find any working processor.

  Fix: set @requires to ("vision",) for files that only need PIL, and to ("vision", "torch") for files that also use torch directly. Also fix 5 modular source files so that `make fix-repo` preserves the correct backends.

  Fixes #45042

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* [Bugfix] Remove redundant @requires(backends=("vision",)) from PIL backends

  Per reviewer feedback, the vision-only @requires decorator is redundant for PIL backend classes, since the PilBackend base class already handles this.

  - Remove @requires(backends=("vision",)) from 43 PIL backend files
  - Remove the unused `requires` import from 38 files (Category A)
  - Keep @requires(backends=("vision", "torch")) on method-level decorators (Category B: 5 files)

* update
* remove torch when it's not necessary
* remove `if TYPE_CHECKING`
* fix import shenanigans
* marvellous, that's how we protect torch :)
* beit is TorchvisionBackend
* more import cleanup
* fixup
* fix-repo
* update
* style
* fixes
* up
* more
* fix repo
* up
* update
* fix imports
* style
* fix check copies
* arf
* converter up
* fix?
* fix copies
* fix for func
* style
* ignore
* type

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Arthur <arthur.zucker@gmail.com>
1 parent 2dbee5a commit 2da00a3
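
The shape of the fix, as a minimal sketch (`MyImageProcessorPil` and its method body are illustrative, not taken from the diff; `requires` and `PilBackend` are the real helpers the diff itself touches):

# Before: a blanket class-level requirement turned every PIL backend class
# into a dummy object whenever torchvision was missing.
#
#     @requires(backends=("vision", "torch", "torchvision"))
#     class MyImageProcessorPil(PilBackend): ...

# After: the class carries no extra decorator (PilBackend already guards the
# "vision" backend), and only genuinely torch-dependent methods are gated.
from transformers.image_processing_backends import PilBackend
from transformers.utils.import_utils import requires


class MyImageProcessorPil(PilBackend):
    @requires(backends=("torch",))
    def post_process_semantic_segmentation(self, outputs, target_sizes=None):
        # torch is imported lazily, so importing this module never needs it;
        # a missing torch only surfaces when this method is actually called.
        import torch
        import torch.nn.functional as F
        ...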

188 files changed: 5095 additions & 1270 deletions


src/transformers/image_processing_backends.py

Lines changed: 2 additions & 2 deletions
@@ -529,7 +529,7 @@ def resize(
         self,
         image: np.ndarray,
         size: SizeDict,
-        resample: Union["PILImageResampling", "tvF.InterpolationMode", int] | None = None,
+        resample: "PILImageResampling | None" = None,
         reducing_gap: int | None = None,
         **kwargs,
     ) -> np.ndarray:
@@ -628,7 +628,7 @@ def _preprocess(
         images: list[np.ndarray],
         do_resize: bool,
         size: SizeDict,
-        resample: Union["PILImageResampling", "tvF.InterpolationMode", int] | None,
+        resample: "PILImageResampling | None",
         do_center_crop: bool,
         crop_size: SizeDict,
         do_rescale: bool,

src/transformers/models/aria/image_processing_aria.py

Lines changed: 2 additions & 5 deletions
@@ -18,17 +18,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import torch
+from torchvision.transforms.v2 import functional as tvF

 from ...image_processing_backends import TorchvisionBackend
 from ...image_processing_utils import BatchFeature, get_patch_output_size, select_best_resolution
 from ...image_transforms import divide_to_patches
 from ...image_utils import ChannelDimension, PILImageResampling, SizeDict, get_image_size
 from ...processing_utils import ImagesKwargs, Unpack
-from ...utils import TensorType, auto_docstring, is_torchvision_available
-
-
-if is_torchvision_available():
-    from torchvision.transforms.v2 import functional as tvF
+from ...utils import TensorType, auto_docstring


 class AriaImageProcessorKwargs(ImagesKwargs, total=False):

src/transformers/models/aria/image_processing_pil_aria.py

Lines changed: 23 additions & 7 deletions
@@ -24,13 +24,29 @@
     SizeDict,
     get_image_size,
 )
-from ...processing_utils import Unpack
+from ...processing_utils import ImagesKwargs, Unpack
 from ...utils import TensorType, auto_docstring
-from ...utils.import_utils import requires
-from .image_processing_aria import AriaImageProcessorKwargs


-@requires(backends=("vision", "torch", "torchvision"))
+# Adapted from transformers.models.aria.image_processing_aria.AriaImageProcessorKwargs
+class AriaImageProcessorKwargs(ImagesKwargs, total=False):
+    r"""
+    max_image_size (`int`, *optional*, defaults to `self.max_image_size`):
+        Maximum image size. Must be either 490 or 980.
+    min_image_size (`int`, *optional*, defaults to `self.min_image_size`):
+        Minimum image size. Images smaller than this in any dimension will be scaled up.
+    split_resolutions (`list[list[int]]`, *optional*, defaults to `self.split_resolutions`):
+        A list of possible resolutions as (height, width) pairs for splitting high-resolution images into patches.
+    split_image (`bool`, *optional*, defaults to `self.split_image`):
+        Whether to split the image into patches using the best matching resolution from `split_resolutions`.
+    """
+
+    max_image_size: int
+    min_image_size: int
+    split_resolutions: list[list[int]]
+    split_image: bool
+
+
 @auto_docstring
 class AriaImageProcessorPil(PilBackend):
     model_input_names = ["pixel_values", "pixel_mask", "num_crops"]
@@ -65,7 +81,7 @@ def _resize_for_patching(
         self,
         image: np.ndarray,
         target_resolution: tuple,
-        resample: "PILImageResampling | int | None",
+        resample: "PILImageResampling | None",
     ) -> np.ndarray:
         """Resize an image to a target resolution while maintaining aspect ratio."""
         new_height, new_width = get_patch_output_size(
@@ -90,7 +106,7 @@ def get_image_patches(
         image: np.ndarray,
         grid_pinpoints: list[list[int]],
         patch_size: int,
-        resample: "PILImageResampling | int | None",
+        resample: "PILImageResampling | None",
     ) -> list[np.ndarray]:
         """
         Process an image with variable resolutions by dividing it into patches.
@@ -131,7 +147,7 @@ def _preprocess(
         min_image_size: int = 336,
         split_resolutions: list[list[int]] | None = None,
         split_image: bool = False,
-        resample: "PILImageResampling | int | None" = None,
+        resample: "PILImageResampling | None" = None,
         **kwargs,
     ) -> BatchFeature:
         if max_image_size not in [490, 980]:

src/transformers/models/aria/modeling_aria.py

Lines changed: 1 addition & 6 deletions
@@ -41,12 +41,7 @@
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import (
-    TransformersKwargs,
-    auto_docstring,
-    can_return_tuple,
-    torch_compilable_check,
-)
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, torch_compilable_check
 from ...utils.generic import maybe_autocast, merge_with_config_defaults
 from ...utils.output_capturing import capture_outputs
 from ..auto import AutoModel

src/transformers/models/aria/modular_aria.py

Lines changed: 1 addition & 9 deletions
@@ -14,6 +14,7 @@
 import torch
 from huggingface_hub.dataclasses import strict
 from torch import nn
+from torchvision.transforms.v2 import functional as tvF

 from ... import initialization as init
 from ...activations import ACT2FN
@@ -39,8 +40,6 @@
     TransformersKwargs,
     auto_docstring,
     can_return_tuple,
-    is_torch_available,
-    is_torchvision_available,
     logging,
 )
 from ..auto import CONFIG_MAPPING, AutoConfig, AutoTokenizer
@@ -323,13 +322,6 @@ def forward(self, key_value_states: torch.Tensor, attn_mask: torch.Tensor | None
         return out


-if is_torch_available():
-    import torch
-
-if is_torchvision_available():
-    from torchvision.transforms.v2 import functional as tvF
-
-
 class AriaImageProcessorKwargs(ImagesKwargs, total=False):
     r"""
     max_image_size (`int`, *optional*, defaults to `self.max_image_size`):

src/transformers/models/beit/image_processing_beit.py

Lines changed: 5 additions & 9 deletions
@@ -15,6 +15,10 @@

 from typing import Union

+import torch
+import torch.nn.functional as F
+from torchvision.transforms.v2 import functional as tvF
+
 from ...image_processing_backends import TorchvisionBackend
 from ...image_processing_utils import BatchFeature
 from ...image_transforms import group_images_by_shape, reorder_images
@@ -27,15 +31,7 @@
     SizeDict,
 )
 from ...processing_utils import ImagesKwargs, Unpack
-from ...utils import TensorType, auto_docstring, is_torch_available, is_torchvision_available
-
-
-if is_torch_available():
-    import torch
-    import torch.nn.functional as F
-
-if is_torchvision_available():
-    from torchvision.transforms.v2 import functional as tvF
+from ...utils import TensorType, auto_docstring, is_torch_available


 class BeitImageProcessorKwargs(ImagesKwargs, total=False):

src/transformers/models/beit/image_processing_pil_beit.py

Lines changed: 18 additions & 10 deletions
@@ -14,9 +14,6 @@
 """Image processor class for BEiT."""

 import numpy as np
-import torch
-import torch.nn.functional as F
-from torchvision.transforms.v2 import functional as tvF

 from ...image_processing_backends import PilBackend
 from ...image_processing_utils import BatchFeature
@@ -28,14 +25,24 @@
     PILImageResampling,
     SizeDict,
 )
-from ...processing_utils import Unpack
-from ...utils import TensorType, auto_docstring, is_torch_available
+from ...processing_utils import ImagesKwargs, Unpack
+from ...utils import TensorType, auto_docstring
 from ...utils.import_utils import requires
-from .image_processing_beit import BeitImageProcessorKwargs
+
+
+# Adapted from transformers.models.beit.image_processing_beit.BeitImageProcessorKwargs
+class BeitImageProcessorKwargs(ImagesKwargs, total=False):
+    r"""
+    do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
+        Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
+        is used for background, and background itself is not included in all classes of a dataset (e.g.
+        ADE20k). The background label will be replaced by 255.
+    """
+
+    do_reduce_labels: bool


 @auto_docstring
-@requires(backends=("vision", "torch", "torchvision"))
 class BeitImageProcessorPil(PilBackend):
     """PIL backend for BEiT with reduce_label support."""

@@ -124,7 +131,7 @@ def _preprocess(
         images: list[np.ndarray],
         do_resize: bool,
         size: SizeDict,
-        resample: "PILImageResampling | tvF.InterpolationMode | int | None",
+        resample: PILImageResampling | None,
         do_center_crop: bool,
         crop_size: SizeDict,
         do_rescale: bool,
@@ -152,6 +159,7 @@ def _preprocess(

         return processed_images

+    @requires(backends=("torch",))
     def post_process_semantic_segmentation(self, outputs, target_sizes: list[tuple] | None = None):
         """
         Converts the output of [`BeitForSemanticSegmentation`] into semantic segmentation maps.
@@ -168,8 +176,8 @@ def post_process_semantic_segmentation(self, outputs, target_sizes: list[tuple]
             segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
             specified). Each entry of each `torch.Tensor` correspond to a semantic class id.
         """
-        if not is_torch_available():
-            raise ImportError("PyTorch is required for post_process_semantic_segmentation")
+        import torch
+        import torch.nn.functional as F

         logits = outputs.logits

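
A quick way to exercise what this diff restores (a sketch, assuming an environment with Pillow installed but not torchvision, and the public BEiT checkpoint microsoft/beit-base-patch16-224):

from transformers import AutoImageProcessor

# Before the fix, the PIL backend class was a torchvision-gated dummy object,
# so this lookup found no usable processor. With the blanket @requires gone,
# AutoImageProcessor can resolve the PIL backend again.
processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224")

# The torch requirement is now enforced per method: without torch, the lazy
# `import torch` inside post_process_semantic_segmentation raises only when
# that method is actually invoked, not at import or instantiation time.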

src/transformers/models/bridgetower/image_processing_bridgetower.py

Lines changed: 3 additions & 8 deletions
@@ -16,6 +16,8 @@
 from typing import Union

 import numpy as np
+import torch
+from torchvision.transforms.v2 import functional as tvF

 from ...image_processing_backends import TorchvisionBackend
 from ...image_processing_utils import BatchFeature
@@ -29,14 +31,7 @@
     get_image_size,
 )
 from ...processing_utils import ImagesKwargs, Unpack
-from ...utils import TensorType, auto_docstring, is_torch_available, is_torchvision_available
-
-
-if is_torch_available():
-    import torch
-
-if is_torchvision_available():
-    from torchvision.transforms.v2 import functional as tvF
+from ...utils import TensorType, auto_docstring, is_torch_available


 def get_resize_output_image_size(

src/transformers/models/bridgetower/image_processing_pil_bridgetower.py

Lines changed: 47 additions & 6 deletions
@@ -20,16 +20,57 @@
 from ...image_utils import (
     OPENAI_CLIP_MEAN,
     OPENAI_CLIP_STD,
+    ChannelDimension,
     PILImageResampling,
     SizeDict,
+    get_image_size,
 )
-from ...processing_utils import Unpack
+from ...processing_utils import ImagesKwargs, Unpack
 from ...utils import TensorType, auto_docstring
-from ...utils.import_utils import requires
-from .image_processing_bridgetower import BridgeTowerImageProcessorKwargs, get_resize_output_image_size


-@requires(backends=("vision", "torch", "torchvision"))
+# Adapted from transformers.models.bridgetower.image_processing_bridgetower.BridgeTowerImageProcessorKwargs
+class BridgeTowerImageProcessorKwargs(ImagesKwargs, total=False):
+    r"""
+    size_divisor (`int`, *optional*, defaults to `self.size_divisor`):
+        The size by which to make sure both the height and width can be divided.
+    """
+
+    size_divisor: int
+
+
+# adapted from transformers.models.bridgetower.image_processing_bridgetower.get_resize_output_image_size
+def get_resize_output_image_size(
+    input_image: np.ndarray,
+    shorter: int = 800,
+    longer: int = 1333,
+    size_divisor: int = 32,
+) -> tuple[int, int]:
+    """Get output image size after resizing with size_divisor."""
+    input_height, input_width = get_image_size(input_image, channel_dim=ChannelDimension.FIRST)
+
+    min_size, max_size = shorter, longer
+    scale = min_size / min(input_height, input_width)
+
+    if input_height < input_width:
+        new_height = min_size
+        new_width = scale * input_width
+    else:
+        new_height = scale * input_height
+        new_width = min_size
+
+    if max(new_height, new_width) > max_size:
+        scale = max_size / max(new_height, new_width)
+        new_height = scale * new_height
+        new_width = scale * new_width
+
+    new_height, new_width = int(new_height + 0.5), int(new_width + 0.5)
+    new_height = new_height // size_divisor * size_divisor
+    new_width = new_width // size_divisor * size_divisor
+
+    return new_height, new_width
+
+
 @auto_docstring
 class BridgeTowerImageProcessorPil(PilBackend):
     """PIL backend for BridgeTower with custom resize and center_crop."""
@@ -57,7 +98,7 @@ def resize(
         self,
         image: np.ndarray,
         size: SizeDict,
-        resample: "PILImageResampling | int | None",
+        resample: "PILImageResampling | None",
         size_divisor: int = 32,
         **kwargs,
     ) -> np.ndarray:
@@ -82,7 +123,7 @@ def _preprocess(
         images: list[np.ndarray],
         do_resize: bool,
         size: SizeDict,
-        resample: "PILImageResampling | int | None",
+        resample: "PILImageResampling | None",
         do_center_crop: bool,
         crop_size: SizeDict,
         do_rescale: bool,
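
A worked example of the resizing arithmetic in get_resize_output_image_size above (a sketch run from within this module; the input values are illustrative):

import numpy as np

# Channels-first image, height=480, width=640, with the defaults above.
image = np.zeros((3, 480, 640))
print(get_resize_output_image_size(image, shorter=800, longer=1333, size_divisor=32))
# -> (800, 1056): scale = 800/480 maps the long side 640 to int(1066.67 + 0.5) = 1067,
#    and flooring both sides to multiples of 32 gives (800, 1056).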

src/transformers/models/chameleon/image_processing_chameleon.py

Lines changed: 3 additions & 7 deletions
@@ -15,6 +15,8 @@

 import numpy as np
 import PIL.Image
+import torch
+from torchvision.transforms.v2 import functional as tvF

 from ...image_processing_backends import TorchvisionBackend
 from ...image_utils import (
@@ -23,15 +25,9 @@
     SizeDict,
 )
 from ...processing_utils import ImagesKwargs, Unpack
-from ...utils import auto_docstring, is_torch_available, is_torchvision_available, logging
+from ...utils import auto_docstring, logging


-if is_torch_available():
-    import torch
-
-if is_torchvision_available():
-    from torchvision.transforms.v2 import functional as tvF
-
 logger = logging.get_logger(__name__)
