Skip to content

Commit 2e61721

Browse files
[Fixes #13936] Support for XLSX File Uploads in GeoNode (#13937)
* xlsx handler

Co-authored-by: Mattia Giupponi <mattia.giupponi@gmail.com>
1 parent fae6656 commit 2e61721

17 files changed

Lines changed: 521 additions & 2 deletions

.env.sample

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -245,4 +245,7 @@ RESTART_POLICY_WINDOW=120s
245245

246246
DEFAULT_MAX_PARALLEL_UPLOADS_PER_USER=5
247247

248-
# FORCE_READ_ONLY_MODE=False Override the read-only value saved in the configuration
248+
# FORCE_READ_ONLY_MODE=False Override the read-only value saved in the configuration
249+
250+
# Enable or disable XLSX / XLS uploads
251+
XLSX_UPLOAD_ENABLED=False

.env_dev

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,4 +207,7 @@ RESTART_POLICY_WINDOW=120s
207207

208208
DEFAULT_MAX_PARALLEL_UPLOADS_PER_USER=5
209209
UPSERT_CHUNK_SIZE=100
210-
UPSERT_LIMIT_ERROR_LOG=100
210+
UPSERT_LIMIT_ERROR_LOG=100
211+
212+
# Enable or disable XLSX / XLS uploads
213+
XLSX_UPLOAD_ENABLED=False

.env_local

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,3 +209,6 @@ RESTART_POLICY_MAX_ATTEMPTS="3"
209209
RESTART_POLICY_WINDOW=120s
210210

211211
DEFAULT_MAX_PARALLEL_UPLOADS_PER_USER=5
212+
213+
# Enable or disable XLSX / XLS uploads
214+
XLSX_UPLOAD_ENABLED=False

.env_test

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,3 +224,6 @@ MICROSOFT_TENANT_ID=
224224
AZURE_CLIENT_ID=
225225
AZURE_SECRET_KEY=
226226
AZURE_KEY=
227+
228+
# Enable or disable XLSX / XLS uploads
229+
XLSX_UPLOAD_ENABLED=False

geonode/settings.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2236,3 +2236,6 @@ def get_geonode_catalogue_service():
22362236

22372237
FILE_UPLOAD_DIRECTORY_PERMISSIONS = 0o777
22382238
FILE_UPLOAD_PERMISSIONS = 0o777
2239+
2240+
# Enable or not the XLSX / XLS upload
2241+
XLSX_UPLOAD_ENABLED = ast.literal_eval(os.getenv("XLSX_UPLOAD_ENABLED", "False"))

geonode/upload/handlers/xlsx/__init__.py

Whitespace-only changes.
Lines changed: 336 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,336 @@
1+
#########################################################################
2+
#
3+
# Copyright (C) 2024 OSGeo
4+
#
5+
# This program is free software: you can redistribute it and/or modify
6+
# it under the terms of the GNU General Public License as published by
7+
# the Free Software Foundation, either version 3 of the License, or
8+
# (at your option) any later version.
9+
#
10+
# This program is distributed in the hope that it will be useful,
11+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
# GNU General Public License for more details.
14+
#
15+
# You should have received a copy of the GNU General Public License
16+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
17+
#
18+
#########################################################################
19+
import logging
20+
from pathlib import Path
21+
import csv
22+
from datetime import datetime
23+
import math
24+
from celery import group
25+
from python_calamine import CalamineWorkbook
26+
from osgeo import ogr
27+
28+
from dynamic_models.models import ModelSchema
29+
from django.conf import settings
30+
31+
from geonode.upload.handlers.common.vector import BaseVectorFileHandler
32+
from geonode.upload.handlers.csv.handler import CSVFileHandler
33+
from geonode.upload.celery_tasks import create_dynamic_structure
34+
from geonode.upload.handlers.utils import GEOM_TYPE_MAPPING
35+
from geonode.upload.api.exceptions import InvalidInputFileException
36+
37+
logger = logging.getLogger("importer")
38+
39+
40+
class XLSXFileHandler(CSVFileHandler):
    """
    Upload handler for Microsoft Excel spreadsheets (xlsx / xls).

    The workbook is validated and converted to a plain CSV file during
    ``pre_processing``; from that point on the parent ``CSVFileHandler``
    pipeline takes over. The whole handler is gated by the
    ``XLSX_UPLOAD_ENABLED`` Django setting (disabled by default).
    """

    # Feature toggle, read once from Django settings at import time.
    XLSX_UPLOAD_ENABLED = getattr(settings, "XLSX_UPLOAD_ENABLED", False)

    # Candidate latitude/longitude header names, shared with the CSV handler
    # so both handlers recognise the same geometry columns.
    lat_names = CSVFileHandler.possible_lat_column
    lon_names = CSVFileHandler.possible_long_column

    @classmethod
    def is_xlsx_enabled(cls):
        """
        Unified check for the feature toggle.

        Returns True when XLSX/XLS uploads are enabled, False otherwise.
        (Previously returned None when disabled; a real bool is clearer and
        has the same truthiness for every existing caller.)
        """
        return bool(cls.XLSX_UPLOAD_ENABLED)

    @property
    def supported_file_extension_config(self):
        """
        UI configuration describing the accepted Excel formats.

        Returns None when the feature is disabled so the UI does not
        advertise the XLSX/XLS options.
        """
        if not self.is_xlsx_enabled():
            return None

        return {
            "id": "excel",  # Use a generic ID that doesn't imply a specific extension
            "formats": [
                {
                    "label": "Excel (xlsx)",
                    "required_ext": ["xlsx"],
                    "optional_ext": ["sld", "xml"],
                },
                {
                    "label": "Excel (xls)",
                    "required_ext": ["xls"],
                    "optional_ext": ["sld", "xml"],
                },
            ],
            "actions": list(self.TASKS.keys()),
            "type": "vector",
        }

    @staticmethod
    def can_handle(_data) -> bool:
        """
        Return True if, with the info provided, this handler is able to
        handle the file; False otherwise.
        """
        # Availability check for the back-end feature toggle
        if not XLSXFileHandler.is_xlsx_enabled():
            return False

        base = _data.get("base_file")
        if not base:
            return False

        # Support both XLSX and XLS
        valid_extensions = (".xlsx", ".xls")

        is_excel = (
            base.lower().endswith(valid_extensions)
            if isinstance(base, str)
            else base.name.lower().endswith(valid_extensions)
        )

        return is_excel and BaseVectorFileHandler.can_handle(_data)

    @staticmethod
    def is_valid(files, user, **kwargs):
        """
        Cheap upfront validation: base GeoNode checks plus the per-user
        parallel-upload limit. The expensive workbook inspection (headers,
        lat/lon sanity) is deferred to ``pre_processing``.
        """
        from geonode.upload.utils import UploadLimitValidator

        # Basic GeoNode validation
        BaseVectorFileHandler.is_valid(files, user)

        # Parallelism check (fast and doesn't need to open the file)
        upload_validator = UploadLimitValidator(user)
        upload_validator.validate_parallelism_limit_per_user()

        return True

    @staticmethod
    def create_ogr2ogr_command(files, original_name, ovverwrite_layer, alternate, **kwargs):
        """
        Build the ogr2ogr command for the (already converted) CSV file.

        Customized for XLSX: only looks for X/Y (Point) data via the
        ``X_POSSIBLE_NAMES`` / ``Y_POSSIBLE_NAMES`` open options.

        NOTE(review): the interpolated values are NOT shell-quoted here
        (the original docstring claimed shlex.quote was used — it is not).
        ``alternate`` is a GeoNode-generated layer name; confirm upstream
        sanitization before reusing this with user-controlled values.

        (``ovverwrite_layer`` keeps the upstream parameter name — typo and
        all — for keyword-call compatibility with the base handler.)
        """
        base_command = BaseVectorFileHandler.create_ogr2ogr_command(files, original_name, ovverwrite_layer, alternate)

        # Column-name candidates are class-level constants
        lat_mapping = ",".join(XLSXFileHandler.lat_names)
        lon_mapping = ",".join(XLSXFileHandler.lon_names)

        additional_option = (
            f' -oo "X_POSSIBLE_NAMES={lon_mapping}" ' f'-oo "Y_POSSIBLE_NAMES={lat_mapping}" ' f'-nln "{alternate}"'
        )

        return (
            f"{base_command} -oo KEEP_GEOM_COLUMNS=NO "
            f"-lco GEOMETRY_NAME={BaseVectorFileHandler().default_geometry_column_name} "
            f"{additional_option}"
        )

    def create_dynamic_model_fields(
        self,
        layer: str,
        dynamic_model_schema: ModelSchema = None,
        overwrite: bool = None,
        execution_id: str = None,
        layer_name: str = None,
        return_celery_group: bool = True,
    ):
        """
        Build the dynamic-model field schema for the layer and, unless
        ``return_celery_group`` is False, the celery group that creates the
        dynamic structure in chunks of 30 fields.
        """
        # retrieving the field schema from ogr2ogr and converting the type to Django Types
        layer_schema = [{"name": x.name.lower(), "class_name": self._get_type(x), "null": True} for x in layer.schema]

        class_name = GEOM_TYPE_MAPPING.get(self.promote_to_multi("Point"))
        # Get the geometry type name from OGR (e.g., 'Point' or 'Point 25D')
        geom_type_name = ogr.GeometryTypeToName(layer.GetGeomType())

        layer_schema += [
            {
                "name": layer.GetGeometryColumn() or self.default_geometry_column_name,
                "class_name": class_name,
                # 3D if OGR reports a 3D / Z-aware geometry type name
                "dim": (3 if geom_type_name.lower().startswith("3d") or "z" in geom_type_name.lower() else 2),
            }
        ]

        if not return_celery_group:
            return layer_schema

        # chunk the schema so each celery task handles at most 30 fields
        list_chunked = [layer_schema[i : i + 30] for i in range(0, len(layer_schema), 30)]
        celery_group = group(
            create_dynamic_structure.s(execution_id, schema, dynamic_model_schema.id, overwrite, layer_name)
            for schema in list_chunked
        )

        return dynamic_model_schema, celery_group

    def pre_processing(self, files, execution_id, **kwargs):
        """
        Validate the workbook and convert it to CSV, then rewrite the
        execution payload so the CSV pipeline picks up the converted file.
        """
        from geonode.upload.orchestrator import orchestrator

        # calling the super function (CSVFileHandler logic)
        _data, execution_id = super().pre_processing(files, execution_id, **kwargs)

        # convert the XLSX file into a CSV
        xlsx_file = _data.get("files", {}).get("base_file", "")
        if not xlsx_file:
            raise InvalidInputFileException(detail="The base file was not found in the upload payload.")

        output_file = str(Path(xlsx_file).with_suffix(".csv"))

        try:
            workbook = CalamineWorkbook.from_path(xlsx_file)

            # Sheet validation (uses the validated sheet name)
            sheet_name = self._validate_sheets(workbook)
            sheet = workbook.get_sheet_by_name(sheet_name)

            rows_gen = iter(sheet.to_python())
            try:
                # Strictly take the first row as headers. No skipping allowed.
                headers = next(rows_gen)
            except StopIteration:
                raise InvalidInputFileException(detail="The file is empty.")

            # Restrictive file-structure validation
            self._validate_headers(headers)

            # Conversion with row cleanup.
            # Note: rows_gen continues from the row after the headers.
            self._convert_to_csv(headers, rows_gen, output_file)

        except Exception as e:
            logger.exception("XLSX Pre-processing failed")
            raise InvalidInputFileException(detail=f"Failed to securely parse Excel: {str(e)}")

        # update the file path in the payload
        _data["files"]["base_file"] = output_file

        if "temporary_files" not in _data or not isinstance(_data["temporary_files"], dict):
            _data["temporary_files"] = {}

        _data["temporary_files"]["base_file"] = output_file

        # updating the execution id params
        orchestrator.update_execution_request_obj(
            orchestrator.get_execution_object(execution_id), {"input_params": _data}
        )
        return _data, execution_id

    def _validate_sheets(self, workbook):
        """Returns the first sheet name and logs warnings if others exist."""
        sheets = workbook.sheet_names
        if not sheets:
            raise InvalidInputFileException(detail="No sheets found in workbook.")
        if len(sheets) > 1:
            logger.warning(f"Multiple sheets found. Ignoring: {sheets[1:]}")
        return sheets[0]

    def _validate_headers(self, headers):
        """
        Strictly validates Row 1 for headers:
        - Must not be empty.
        - Must contain geometry 'fingerprints' (Lat/Lon).
        - Must have unique and non-empty column names.
        """
        # Existence check
        if not headers or self._detect_empty_rows(headers):
            raise InvalidInputFileException(detail="No data or headers found in the selected sheet.")

        # Normalization
        clean_headers = [str(h).strip().lower() if h is not None else "" for h in headers]

        # Geometry fingerprint check
        has_lat = any(h in self.lat_names for h in clean_headers)
        has_lon = any(h in self.lon_names for h in clean_headers)

        if not (has_lat and has_lon):
            raise InvalidInputFileException(
                detail="The headers do not contain valid geometry headers. "
                "GeoNode requires Latitude and Longitude labels in the first row."
            )

        # Integrity check (no empty names)
        if any(h == "" for h in clean_headers):
            raise InvalidInputFileException(detail="One or more columns in the first row are missing a header name.")

        # Uniqueness check
        if len(clean_headers) != len(set(clean_headers)):
            duplicates = set([h for h in clean_headers if clean_headers.count(h) > 1])
            raise InvalidInputFileException(detail=f"Duplicate headers found in Row 1: {', '.join(duplicates)}")

        return True

    def _data_sense_check(self, x, y):
        """
        High-speed coordinate validation for large datasets.
        Returns True only for finite numerics in a plausible coordinate range.
        """
        try:
            # Catch Excel Date objects immediately (Calamine returns these as datetime)
            if isinstance(x, datetime) or isinstance(y, datetime):
                return False

            f_x = float(x)
            f_y = float(y)

            # Finiteness check (catches NaN, Inf; None fails the float() above)
            if not (math.isfinite(f_x) and math.isfinite(f_y)):
                return False

            # Magnitude check:
            # limits to +/- 40 million (covers all CRS including Web Mercator)
            # but blocks 'serial date numbers' or corrupted scientific notation
            if not (-40000000 < f_x < 40000000 and -40000000 < f_y < 40000000):
                return False

            return True
        except (ValueError, TypeError):
            return False

    def _detect_empty_rows(self, row):
        # A row is "empty" when it has no cells or only blank/None cells.
        return not row or all(cell is None or str(cell).strip() == "" for cell in row)

    def _convert_to_csv(self, headers, rows_gen, output_path):
        """Streams valid data to CSV, skipping empty rows.

        Raises InvalidInputFileException for ragged rows (shorter than the
        header row) or rows whose Lat/Lon cells are not valid coordinates.
        """
        # Define clean_headers once here to find the indices
        clean_headers = [str(h).strip().lower() for h in headers]

        # Get the indices for the Lat and Lon columns
        # (guaranteed to exist by _validate_headers)
        lat_idx = next(i for i, h in enumerate(clean_headers) if h in self.lat_names)
        lon_idx = next(i for i, h in enumerate(clean_headers) if h in self.lon_names)

        # Local binding of the check function for loop speed
        check_func = self._data_sense_check

        # FIX: rows shorter than the header would previously raise a bare
        # IndexError on row[lon_idx]; surface them as a validation error.
        min_cells = max(lat_idx, lon_idx) + 1

        with open(output_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(headers)

            for row_num, row in enumerate(rows_gen, start=2):
                # Skip row if it contains no data
                if self._detect_empty_rows(row):
                    continue

                if len(row) < min_cells:
                    raise InvalidInputFileException(
                        detail=f"Coordinate error at row {row_num}. "
                        "The row has fewer cells than the header row."
                    )

                if not check_func(row[lon_idx], row[lat_idx]):
                    raise InvalidInputFileException(
                        detail=f"Coordinate error at row {row_num}. "
                        "Check for dates or non-numeric values in Lat/Lon."
                    )

                writer.writerow(row)

0 commit comments

Comments
 (0)