Skip to content

Commit 749a5ac

Browse files
author
Jonas Widmer
committed
linting and overwrite refactoring, started segmentation implementation
1 parent fcb871a commit 749a5ac

9 files changed

Lines changed: 365 additions & 193 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ cython_debug/
165165
.DS_Store
166166
.AppleDouble
167167
.LSOverride
168+
qodana.yaml
168169

169170
# Icon must end with two \r
170171
Icon
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,19 @@
11
class ImageFetchException(Exception):
2+
"""
3+
Exception raised when an image cannot be fetched
4+
"""
25
pass
36

47

58
class ImageProcessException(Exception):
9+
"""
10+
Exception raised when an image cannot be processed
11+
"""
612
pass
713

814

915
class ParseTextLinesException(Exception):
16+
"""
17+
Exception raised when an error occurs while parsing
18+
"""
1019
pass

flow_preprocessor/preprocessing_logic/fetch_images.py

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
"""
2+
Defines the ImageDownloader class, which manages image fetching.
3+
"""
4+
15
# ===============================================================================
26
# IMPORT STATEMENTS
37
# ===============================================================================
@@ -42,11 +46,16 @@ def _request_image_via_url(self, url: str, filename: str) -> None:
4246
:param url: the image url
4347
:param filename: the image filename
4448
"""
45-
response = requests.get(url)
46-
response.raise_for_status()
47-
with open(filename, 'wb') as file:
48-
file.write(response.content)
49-
logger.info(f'{self.__class__.__name__} - File downloaded: {filename}')
49+
try:
50+
response = requests.get(url, timeout=20)
51+
response.raise_for_status()
52+
with open(filename, 'wb') as file:
53+
file.write(response.content)
54+
logger.info('%s - File downloaded: %s', self.__class__.__name__, filename)
55+
except requests.exceptions.Timeout:
56+
logger.info('%s - Image download timed out', self.__class__.__name__)
57+
except requests.exceptions.RequestException as e:
58+
logger.info('%s - Image download failed: %s', self.__class__.__name__, e)
5059

5160
def fetch_image(self, page: Page, img_output: str) -> None:
5261
"""
@@ -68,26 +77,32 @@ def fetch_image(self, page: Page, img_output: str) -> None:
6877
except requests.exceptions.RequestException as e:
6978
self.failed_downloads.append(image_filename)
7079
logger.error(
71-
f'{self.__class__.__name__} - Failed to download file {image_filename}',
80+
'%s - Failed to download file %s',
81+
self.__class__.__name__,
82+
image_filename,
7283
exc_info=True
7384
)
74-
raise ImageFetchException('Failed to download file %s.', e)
85+
raise ImageFetchException(f'Failed to download file {e}.') from e
7586
except (et.XMLSyntaxError, et.ParseError, IndexError, TypeError, ValueError) as e:
7687
if image_filename is not None:
7788
self.failed_processing.append(image_filename)
7889
logger.error(
79-
f'{self.__class__.__name__} - Error parsing file {page}',
90+
'%s - Error parsing file %s',
91+
self.__class__.__name__,
92+
page,
8093
exc_info=True
8194
)
82-
raise ImageProcessException('Error parsing file %s: %s', e)
95+
raise ImageProcessException(f'Error parsing file {e}') from e
8396
except Exception as e:
8497
if image_filename is not None:
8598
self.failed_processing.append(image_filename)
8699
logger.error(
87-
f'{self.__class__.__name__} - An unexpected error occurred for file {page}',
100+
'%s - An unexpected error occurred for file %s',
101+
self.__class__.__name__,
102+
page,
88103
exc_info=True
89104
)
90-
raise Exception('An unexpected error occurred for file %s: %s', e)
105+
raise RuntimeError(f'An unexpected error occurred for file {image_filename}: {e}') from e
91106

92107
def get_failed_downloads(self) -> List[str]:
93108
"""

flow_preprocessor/preprocessing_logic/models.py

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,26 @@
1+
"""
2+
Models to use for the preprocessing package - mainly the status model
3+
"""
14
from datetime import datetime
2-
3-
from pydantic import BaseModel, Field
45
from typing import Optional, List
56
import enum
67

8+
from pydantic import BaseModel, Field
9+
710

811
class StateEnum(enum.Enum):
12+
"""
13+
Possible states of the process
14+
"""
915
IN_PROGRESS = "in_progress"
1016
FAILED = "failed"
1117
DONE = "done"
1218

1319

1420
class PreprocessState(BaseModel):
21+
"""
22+
The state of a preprocessing job
23+
"""
1524
process_id: str = Field(alias="process_id",
1625
description="The unique ID of the preprocess status.",
1726
title="ID")
@@ -38,21 +47,6 @@ class PreprocessState(BaseModel):
3847
alias="stop_on_fail",
3948
description="Whether to stop processing on failure.",
4049
title="Stop-On-Fail")
41-
directory: str = Field(default="tmp",
42-
alias="directory",
43-
description="Directory to save the files temporarily to.",
44-
title="Directory",
45-
examples=["tmp"])
46-
in_path: str = Field(default="fetched",
47-
alias="in_path",
48-
description="Path to save the fetched files.",
49-
title="In-Path",
50-
examples=["fetched"])
51-
out_path: str = Field(default="preprocessed",
52-
alias="out_path",
53-
description="Path to save the preprocessed files.",
54-
title="Out-Path",
55-
examples=["preprocessed"])
5650
progress: int = Field(alias="progress",
5751
description="The progress of the preprocess status.",
5852
title="Progress",
@@ -89,7 +83,16 @@ class PreprocessState(BaseModel):
8983
description="The names of the files that failed downloading.",
9084
title="Filenames-Failed-Download",
9185
default=[])
86+
line_images: Optional[List] = Field(alias="line_images",
87+
description="The names of the line images processed.",
88+
title="Filenames-Line-Images-Processed",
89+
default=[])
9290
runtime: Optional[int] = Field(alias="runtime",
9391
description="The runtime of the preprocess status.",
9492
title="Runtime",
9593
default=0)
94+
segment: Optional[bool] = Field(alias="segment",
95+
description="Whether the images have to be segmented before processing.",
96+
title="Segment",
97+
default=False
98+
)

0 commit comments

Comments
 (0)