|
#########################################################################
#
# Copyright (C) 2024 OSGeo
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#########################################################################
import csv
import logging
import math
import shlex
from datetime import datetime
from pathlib import Path

from celery import group
from python_calamine import CalamineWorkbook
from osgeo import ogr

from dynamic_models.models import ModelSchema
from django.conf import settings

from geonode.upload.handlers.common.vector import BaseVectorFileHandler
from geonode.upload.handlers.csv.handler import CSVFileHandler
from geonode.upload.celery_tasks import create_dynamic_structure
from geonode.upload.handlers.utils import GEOM_TYPE_MAPPING
from geonode.upload.api.exceptions import InvalidInputFileException

logger = logging.getLogger("importer")


class XLSXFileHandler(CSVFileHandler):

    XLSX_UPLOAD_ENABLED = getattr(settings, "XLSX_UPLOAD_ENABLED", False)

    lat_names = CSVFileHandler.possible_lat_column
    lon_names = CSVFileHandler.possible_long_column

    @classmethod
    def is_xlsx_enabled(cls):
        """
        Unified check for the feature toggle.
        Returns True when XLSX upload is enabled, False otherwise.
        """
        return bool(cls.XLSX_UPLOAD_ENABLED)
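
    # Note: XLSX_UPLOAD_ENABLED is bound once at import time (it is a class
    # attribute), so changing the Django setting afterwards has no effect until
    # the process restarts. Illustrative settings.py entry:
    #   XLSX_UPLOAD_ENABLED = True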

    @property
    def supported_file_extension_config(self):
        # If disabled, return None so the UI doesn't show the Excel options
        if not self.is_xlsx_enabled():
            return None

        return {
            "id": "excel",  # generic ID that doesn't imply a specific extension
            "formats": [
                {
                    "label": "Excel (xlsx)",
                    "required_ext": ["xlsx"],
                    "optional_ext": ["sld", "xml"],
                },
                {
                    "label": "Excel (xls)",
                    "required_ext": ["xls"],
                    "optional_ext": ["sld", "xml"],
                },
            ],
            "actions": list(self.TASKS.keys()),
            "type": "vector",
        }

    @staticmethod
    def can_handle(_data) -> bool:
        """
        Return True if, based on the information provided,
        this handler is able to process the file.
        """
        # Back-end availability check for the feature toggle
        if not XLSXFileHandler.is_xlsx_enabled():
            return False

        base = _data.get("base_file")
        if not base:
            return False

        # Support both XLSX and XLS
        valid_extensions = (".xlsx", ".xls")

        is_excel = (
            base.lower().endswith(valid_extensions)
            if isinstance(base, str)
            else base.name.lower().endswith(valid_extensions)
        )

        return is_excel and BaseVectorFileHandler.can_handle(_data)
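
    # Illustrative behaviour (payload shape assumed from the importer contract,
    # and assuming the base vector checks pass):
    #   XLSXFileHandler.can_handle({"base_file": "sites.xlsx"})  # True when enabled
    #   XLSXFileHandler.can_handle({"base_file": "sites.csv"})   # False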

    @staticmethod
    def is_valid(files, user, **kwargs):
        from geonode.upload.utils import UploadLimitValidator

        # Basic GeoNode validation
        BaseVectorFileHandler.is_valid(files, user)

        # Parallelism check (fast; does not require opening the file)
        upload_validator = UploadLimitValidator(user)
        upload_validator.validate_parallelism_limit_per_user()

        # The deep inspection (lat/lon) happens later, in pre_processing.
        return True

    @staticmethod
    def create_ogr2ogr_command(files, original_name, ovverwrite_layer, alternate, **kwargs):
        """
        Customized for Excel input: only looks for X/Y (Point) data.
        The user-controlled layer name is sanitized with shlex.quote to
        prevent command injection.
        (The parameter name "ovverwrite_layer" is kept as-is to match the
        base handler's signature.)
        """
        base_command = BaseVectorFileHandler.create_ogr2ogr_command(files, original_name, ovverwrite_layer, alternate)

        # Column-name mappings (safe: class-level constants, not user input)
        lat_mapping = ",".join(XLSXFileHandler.lat_names)
        lon_mapping = ",".join(XLSXFileHandler.lon_names)

        additional_option = (
            f' -oo "X_POSSIBLE_NAMES={lon_mapping}" '
            f'-oo "Y_POSSIBLE_NAMES={lat_mapping}" '
            f"-nln {shlex.quote(alternate)}"
        )

        # Return the combined, safe command string
        return (
            f"{base_command} -oo KEEP_GEOM_COLUMNS=NO "
            f"-lco GEOMETRY_NAME={BaseVectorFileHandler().default_geometry_column_name} "
            f"{additional_option}"
        )
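
    # Rough shape of the resulting command string (illustrative; the leading
    # part comes from BaseVectorFileHandler, and the geometry column name is
    # typically "geometry"):
    #   ogr2ogr ... -oo KEEP_GEOM_COLUMNS=NO -lco GEOMETRY_NAME=geometry \
    #       -oo "X_POSSIBLE_NAMES=lon,long,..." -oo "Y_POSSIBLE_NAMES=lat,..." -nln my_layer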

    def create_dynamic_model_fields(
        self,
        layer: ogr.Layer,
        dynamic_model_schema: ModelSchema = None,
        overwrite: bool = None,
        execution_id: str = None,
        layer_name: str = None,
        return_celery_group: bool = True,
    ):
        # Retrieve the field schema from OGR and convert each type to a Django type
        layer_schema = [{"name": x.name.lower(), "class_name": self._get_type(x), "null": True} for x in layer.schema]

        # Excel uploads always yield point geometries
        class_name = GEOM_TYPE_MAPPING.get(self.promote_to_multi("Point"))
        # Get the geometry type name from OGR (e.g., 'Point' or '3D Point')
        geom_type_name = ogr.GeometryTypeToName(layer.GetGeomType())

        layer_schema += [
            {
                "name": layer.GetGeometryColumn() or self.default_geometry_column_name,
                "class_name": class_name,
                "dim": (3 if geom_type_name.lower().startswith("3d") or "z" in geom_type_name.lower() else 2),
            }
        ]

        if not return_celery_group:
            return layer_schema

        # Create the dynamic-model fields in chunks of 30 to keep each task small
        list_chunked = [layer_schema[i : i + 30] for i in range(0, len(layer_schema), 30)]
        celery_group = group(
            create_dynamic_structure.s(execution_id, schema, dynamic_model_schema.id, overwrite, layer_name)
            for schema in list_chunked
        )

        return dynamic_model_schema, celery_group
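
    # Illustrative layer_schema entries (field name hypothetical, and assuming
    # promote_to_multi("Point") yields "MultiPoint"):
    #   {"name": "site_name", "class_name": <Django type from _get_type>, "null": True}
    # plus one geometry entry, e.g. for 2D points:
    #   {"name": "geometry", "class_name": GEOM_TYPE_MAPPING.get("MultiPoint"), "dim": 2}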

    def pre_processing(self, files, execution_id, **kwargs):
        from geonode.upload.orchestrator import orchestrator

        # Run the parent (CSVFileHandler) pre-processing first
        _data, execution_id = super().pre_processing(files, execution_id, **kwargs)

        # Convert the Excel file into a CSV
        xlsx_file = _data.get("files", {}).get("base_file", "")
        if not xlsx_file:
            raise InvalidInputFileException(detail="The base file was not found in the upload payload.")

        output_file = str(Path(xlsx_file).with_suffix(".csv"))

        try:
            workbook = CalamineWorkbook.from_path(xlsx_file)

            # Sheet validation (returns the name of the sheet to import)
            sheet_name = self._validate_sheets(workbook)
            sheet = workbook.get_sheet_by_name(sheet_name)

            rows_gen = iter(sheet.to_python())
            try:
                # The first row must be the header row; no skipping allowed.
                headers = next(rows_gen)
            except StopIteration:
                raise InvalidInputFileException(detail="The file is empty.")

            # Restrictive file-structure validation
            self._validate_headers(headers)

            # Conversion with row cleanup
            # Note: rows_gen continues from the row after the headers
            self._convert_to_csv(headers, rows_gen, output_file)

        except InvalidInputFileException:
            # Preserve the specific validation message
            raise
        except Exception as e:
            logger.exception("XLSX pre-processing failed")
            raise InvalidInputFileException(detail=f"Failed to parse the Excel file: {str(e)}")

        # Update the file path in the payload
        _data["files"]["base_file"] = output_file

        if "temporary_files" not in _data or not isinstance(_data["temporary_files"], dict):
            _data["temporary_files"] = {}

        _data["temporary_files"]["base_file"] = output_file

        # Update the execution request params
        orchestrator.update_execution_request_obj(
            orchestrator.get_execution_object(execution_id), {"input_params": _data}
        )
        return _data, execution_id

    def _validate_sheets(self, workbook):
        """Returns the first sheet name and logs warnings if others exist."""
        sheets = workbook.sheet_names
        if not sheets:
            raise InvalidInputFileException(detail="No sheets found in workbook.")
        if len(sheets) > 1:
            logger.warning(f"Multiple sheets found. Ignoring: {sheets[1:]}")
        return sheets[0]

    def _validate_headers(self, headers):
        """
        Strictly validates row 1 as the header row:
        - Must not be empty.
        - Must contain geometry 'fingerprints' (Lat/Lon).
        - Must have unique and non-empty column names.
        """
        # Existence check
        if not headers or self._detect_empty_rows(headers):
            raise InvalidInputFileException(detail="No data or headers found in the selected sheet.")

        # Normalization
        clean_headers = [str(h).strip().lower() if h is not None else "" for h in headers]

        # Geometry fingerprint check
        has_lat = any(h in self.lat_names for h in clean_headers)
        has_lon = any(h in self.lon_names for h in clean_headers)

        if not (has_lat and has_lon):
            raise InvalidInputFileException(
                detail="The headers do not contain valid geometry headers. "
                "GeoNode requires Latitude and Longitude labels in the first row."
            )

        # Integrity check (no empty names)
        if any(h == "" for h in clean_headers):
            raise InvalidInputFileException(detail="One or more columns in the first row are missing a header name.")

        # Uniqueness check
        if len(clean_headers) != len(set(clean_headers)):
            duplicates = {h for h in clean_headers if clean_headers.count(h) > 1}
            raise InvalidInputFileException(detail=f"Duplicate headers found in row 1: {', '.join(duplicates)}")

        return True
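
    # Illustrative outcomes (assuming "lat"/"lon" are among the recognised
    # column names inherited from CSVFileHandler):
    #   ["name", "lat", "lon"]         -> passes
    #   ["name", "x"]                  -> fails the geometry fingerprint check
    #   ["name", "lat", "lon", "lat"]  -> fails the uniqueness check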

    def _data_sense_check(self, x, y):
        """
        Fast coordinate validation suitable for large datasets.
        """
        try:
            # Catch Excel date cells immediately (Calamine returns these as datetime)
            if isinstance(x, datetime) or isinstance(y, datetime):
                return False

            f_x = float(x)
            f_y = float(y)

            # Finiteness check: rejects NaN and +/-Inf cheaply
            if not (math.isfinite(f_x) and math.isfinite(f_y)):
                return False

            # Magnitude check: +/- 40 million covers common projected CRS
            # (including Web Mercator) while blocking Excel serial date
            # numbers and corrupted scientific notation
            if not (-40000000 < f_x < 40000000 and -40000000 < f_y < 40000000):
                return False

            return True
        except (ValueError, TypeError):
            return False
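
    # Expected behaviour on sample values (illustrative):
    #   self._data_sense_check(12.49, 41.89)            -> True   (degrees)
    #   self._data_sense_check(1390000.0, 5146000.0)    -> True   (Web Mercator metres)
    #   self._data_sense_check("n/a", 41.89)            -> False  (non-numeric)
    #   self._data_sense_check(datetime(2024, 1, 1), 0) -> False  (Excel date cell)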

    def _detect_empty_rows(self, row):
        """Return True if the row contains no non-blank cells."""
        return not row or all(cell is None or str(cell).strip() == "" for cell in row)
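
    # e.g. _detect_empty_rows([None, "", "  "]) -> True
    #      _detect_empty_rows(["a", None])      -> False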

    def _convert_to_csv(self, headers, rows_gen, output_path):
        """Streams valid data to CSV, skipping empty rows."""

        # Normalize the headers once to locate the coordinate columns
        clean_headers = [str(h).strip().lower() for h in headers]

        # Indices of the Lat and Lon columns (presence guaranteed by _validate_headers)
        lat_idx = next(i for i, h in enumerate(clean_headers) if h in self.lat_names)
        lon_idx = next(i for i, h in enumerate(clean_headers) if h in self.lon_names)

        # Local binding of the check function for loop speed
        check_func = self._data_sense_check

        with open(output_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(headers)

            for row_num, row in enumerate(rows_gen, start=2):
                # Skip rows that contain no data at all
                if self._detect_empty_rows(row):
                    continue

                # Guard against ragged rows shorter than the header row
                if len(row) <= max(lat_idx, lon_idx):
                    raise InvalidInputFileException(detail=f"Row {row_num} is missing its Lat/Lon values.")

                if not check_func(row[lon_idx], row[lat_idx]):
                    raise InvalidInputFileException(
                        detail=f"Coordinate error at row {row_num}. "
                        "Check for dates or non-numeric values in Lat/Lon."
                    )

                writer.writerow(row)
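
# Minimal sketch of the conversion path in isolation (paths hypothetical; the
# orchestrator normally drives this through pre_processing):
#   wb = CalamineWorkbook.from_path("points.xlsx")
#   rows = iter(wb.get_sheet_by_name(wb.sheet_names[0]).to_python())
#   handler = XLSXFileHandler()
#   headers = next(rows)
#   handler._validate_headers(headers)
#   handler._convert_to_csv(headers, rows, "points.csv")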