Fixed bug when using direct path load with a single byte character set (#567)

anthony-tuininga · anthony-tuininga · commit cbb14426fe2b · 2026-03-24T09:40:02.000-06:00
and when encoding/decoding strings in database objects (#371).
diff --git a/doc/src/release_notes.rst b/doc/src/release_notes.rst
@@ -19,6 +19,11 @@ oracledb `4.0.0 <https://github.com/oracle/python-oracledb/compare/v3.4.2...v4.0
 Thin Mode Changes
 +++++++++++++++++
 
+#)  Fixed bug when using direct path load with a single-byte database
+    character set
+    (`issue 567 <https://github.com/oracle/python-oracledb/issues/567>`__).
+#)  Fixed bug when decoding/encoding strings found within database objects
+    (`issue 371 <https://github.com/oracle/python-oracledb/issues/371>`__).
 #)  Fixed bug when unexpected error is thrown during authentication when using
     tokens
     (`issue 542 <https://github.com/oracle/python-oracledb/issues/542>`__).
diff --git a/src/oracledb/base_impl.pxd b/src/oracledb/base_impl.pxd
@@ -1053,11 +1053,13 @@ cdef int convert_oracle_data_to_arrow(OracleMetadata from_metadata,
 cdef object convert_oracle_data_to_python(OracleMetadata from_metadata,
                                           OracleMetadata to_metadatda,
                                           OracleData* data,
+                                          const char* encoding,
                                           const char* encoding_errors,
                                           bint from_dbobject)
 cdef object convert_python_to_oracle_data(OracleMetadata metadata,
                                           OracleData* data,
-                                          object value)
+                                          object value,
+                                          const char* encoding)
 cdef int convert_vector_to_arrow(ArrowArrayImpl array_impl,
                                  object vector) except -1
 cdef cydatetime.datetime convert_date_to_python(OracleDataBuffer *buffer)
diff --git a/src/oracledb/errors.py b/src/oracledb/errors.py
@@ -335,6 +335,7 @@ def _raise_not_supported(feature: str) -> None:
 ERR_UNSUPPORTED_ARROW_TYPE = 3037
 ERR_CANNOT_CONVERT_TO_ARROW_TYPE = 3038
 ERR_CANNOT_CONVERT_FROM_ARROW_TYPE = 3039
+ERR_DB_CS_NOT_SUPPORTED = 3040
 
 # error numbers that result in DatabaseError
 ERR_TNS_ENTRY_NOT_FOUND = 4000
@@ -641,6 +642,10 @@ def _raise_not_supported(feature: str) -> None:
     ),
     ERR_CURSOR_HAS_BEEN_CLOSED: "cursor has been closed by the database",
     ERR_CURSOR_NOT_OPEN: "cursor is not open",
+    ERR_DB_CS_NOT_SUPPORTED: (
+        "database character set id {charset_id} is not supported by "
+        "python-oracledb in thin mode"
+    ),
     ERR_DBOBJECT_ATTR_MAX_SIZE_VIOLATED: (
         "attribute {attr_name} of type {type_name} exceeds its maximum size "
         "(actual: {actual_size}, maximum: {max_size})"
diff --git a/src/oracledb/impl/base/converters.pyx b/src/oracledb/impl/base/converters.pyx
@@ -418,16 +418,15 @@ cdef int convert_str_to_arrow(ArrowArrayImpl array_impl,
     array_impl.append_bytes(<void*> rb.ptr, rb.num_bytes)
 
 
-cdef object convert_str_to_python(OracleDataBuffer *buffer, uint8_t csfrm,
+cdef object convert_str_to_python(OracleDataBuffer *buffer,
+                                  const char* encoding,
                                   const char* encoding_errors):
     """
     Converts a CHAR, NCHAR, LONG, VARCHAR, or NVARCHAR value stored in the
     buffer to Python string.
     """
     cdef OracleRawBytes *rb = &buffer.as_raw_bytes
-    if csfrm == CS_FORM_IMPLICIT:
-        return rb.ptr[:rb.num_bytes].decode(ENCODING_UTF8, encoding_errors)
-    return rb.ptr[:rb.num_bytes].decode(ENCODING_UTF16, encoding_errors)
+    return rb.ptr[:rb.num_bytes].decode(encoding, encoding_errors)
 
 
 cdef int convert_oracle_data_to_arrow(OracleMetadata from_metadata,
@@ -499,21 +498,20 @@ cdef int convert_oracle_data_to_arrow(OracleMetadata from_metadata,
 cdef object convert_oracle_data_to_python(OracleMetadata from_metadata,
                                           OracleMetadata to_metadata,
                                           OracleData* data,
+                                          const char* encoding,
                                           const char* encoding_errors,
                                           bint from_dbobject):
     """
     Converts the value stored in OracleData to a Python object.
     """
-    cdef:
-        uint8_t py_type_num, ora_type_num, csfrm
+    cdef uint8_t py_type_num, ora_type_num
 
     # NULL values
     if data.is_null:
         return None
 
     # reduce typing
     ora_type_num = from_metadata.dbtype._ora_type_num
-    csfrm = from_metadata.dbtype._csfrm
     py_type_num = to_metadata._py_type_num
 
     # Python bytes
@@ -539,7 +537,8 @@ cdef object convert_oracle_data_to_python(OracleMetadata from_metadata,
             ORA_TYPE_NUM_LONG,
             ORA_TYPE_NUM_VARCHAR
         ):
-            return convert_str_to_python(&data.buffer, csfrm, encoding_errors)
+            return convert_str_to_python(&data.buffer, encoding,
+                                         encoding_errors)
 
         # Oracle NUMBER
         elif ora_type_num == ORA_TYPE_NUM_NUMBER:
@@ -591,7 +590,8 @@ cdef object convert_oracle_data_to_python(OracleMetadata from_metadata,
             ORA_TYPE_NUM_LONG,
             ORA_TYPE_NUM_VARCHAR
         ):
-            value = convert_str_to_python(&data.buffer, csfrm, encoding_errors)
+            value = convert_str_to_python(&data.buffer, encoding,
+                                          encoding_errors)
             return int(PY_TYPE_DECIMAL(value))
 
         # Oracle BINARY_DOUBLE
@@ -674,7 +674,8 @@ cdef object convert_oracle_data_to_python(OracleMetadata from_metadata,
 
 cdef object convert_python_to_oracle_data(OracleMetadata metadata,
                                           OracleData* data,
-                                          object value):
+                                          object value,
+                                          const char* encoding):
     """
     Converts a Python value to the OracleData structure. The object returned is
     any temporary object that is required to be retained (if any).
@@ -688,10 +689,7 @@ cdef object convert_python_to_oracle_data(OracleMetadata metadata,
     elif ora_type_num in (ORA_TYPE_NUM_VARCHAR,
                           ORA_TYPE_NUM_CHAR,
                           ORA_TYPE_NUM_LONG):
-        if metadata.dbtype._csfrm == CS_FORM_IMPLICIT:
-            temp_bytes = (<str> value).encode()
-        else:
-            temp_bytes = (<str> value).encode(ENCODING_UTF16)
+        temp_bytes = (<str> value).encode(encoding)
         convert_bytes_to_oracle_data(&data.buffer, temp_bytes)
         if data.buffer.as_raw_bytes.num_bytes == 0:
             data.is_null = True
diff --git a/src/oracledb/impl/thin/capabilities.pyx b/src/oracledb/impl/thin/capabilities.pyx
@@ -1,5 +1,5 @@
 #------------------------------------------------------------------------------
-# Copyright (c) 2021, 2025, Oracle and/or its affiliates.
+# Copyright (c) 2021, 2026, Oracle and/or its affiliates.
 #
 # This software is dual-licensed to you under the Universal Permissive License
 # (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License
@@ -30,12 +30,70 @@
 # thin_impl.pyx).
 #------------------------------------------------------------------------------
 
+# maps Oracle Database character set ids (the values NLS_CHARSET_ID returns)
+# to the names of the Python codecs used to encode and decode them
+cdef dict ORACLE_CHARSET_TO_PYTHON_ENCODING = {
+    # ASCII
+    1: "ascii",                         # US7ASCII
+
+    # ISO 8859 series and TIS-620 (Thai)
+    31: "iso_8859_1",                   # WE8ISO8859P1
+    32: "iso_8859_2",                   # EE8ISO8859P2
+    33: "iso_8859_3",                   # SE8ISO8859P3
+    34: "iso_8859_4",                   # NEE8ISO8859P4
+    35: "iso_8859_5",                   # CL8ISO8859P5
+    36: "iso_8859_6",                   # AR8ISO8859P6
+    37: "iso_8859_7",                   # EL8ISO8859P7
+    38: "iso_8859_8",                   # IW8ISO8859P8
+    39: "iso_8859_9",                   # WE8ISO8859P9
+    40: "iso_8859_10",                  # NE8ISO8859P10
+    41: "tis_620",                      # TH8TISASCII
+    46: "iso_8859_15",                  # WE8ISO8859P15
+    47: "iso_8859_13",                  # BLT8ISO8859P13
+
+    # Windows code pages
+    45: "cp1258",                       # VN8MSWIN1258
+    170: "cp1250",                      # EE8MSWIN1250
+    171: "cp1251",                      # CL8MSWIN1251
+    174: "cp1253",                      # EL8MSWIN1253
+    175: "cp1255",                      # IW8MSWIN1255
+    177: "cp1254",                      # TR8MSWIN1254
+    178: "cp1252",                      # WE8MSWIN1252
+    179: "cp1257",                      # BLT8MSWIN1257
+    560: "cp1256",                      # AR8MSWIN1256
+
+    # DOS / PC code pages
+    4: "cp437",                         # US8PC437
+    10: "cp850",                        # WE8PC850
+    150: "cp852",                       # EE8PC852
+    152: "cp866",                       # RU8PC866
+
+    # East Asian multi-byte
+    830: "euc_jp",                      # JA16EUC
+    832: "cp932",                       # JA16SJIS
+    837: "euc_jp",                      # JA16EUCTILDE
+    838: "cp932",                       # JA16SJISTILDE
+    840: "euc_kr",                      # KO16KSC5601
+    846: "cp949",                       # KO16MSWIN949
+    852: "gbk",                         # ZHS16GBK
+    854: "gb18030",                     # ZHS32GB18030
+    865: "big5",                        # ZHT16BIG5
+    867: "cp950",                       # ZHT16MSWIN950
+    868: "big5hkscs",                   # ZHT16HKSCS
+
+    # universal encodings
+    873: "utf_8",                       # AL32UTF8
+    2000: "utf_16_be",                  # AL16UTF16
+}
+
 cdef class Capabilities:
     cdef:
         uint16_t protocol_version
         uint8_t ttc_field_version
         uint16_t charset_id
+        const char* encoding
         uint16_t ncharset_id
+        const char* nencoding
         bytearray compile_caps
         bytearray runtime_caps
         uint32_t max_string_size
@@ -87,14 +145,36 @@ cdef class Capabilities:
         if not (server_caps[TNS_RCAP_TTC] & TNS_RCAP_TTC_SESSION_STATE_OPS):
             self.supports_request_boundaries = False
 
-    cdef int _check_ncharset_id(self) except -1:
+    cdef const char* _get_encoding(self) except NULL:
+        """
+        Returns the Python codec name for the database character set or raises
+        an exception if it is unsupported. Only direct path load and strings
+        found within Oracle database objects need this. The result points into
+        the str owned by the mapping table, so it remains valid.
+        """
+        cdef str encoding
+        if self.encoding != NULL:
+            return self.encoding
+        encoding = ORACLE_CHARSET_TO_PYTHON_ENCODING.get(self.charset_id)
+        if encoding is None:
+            errors._raise_err(errors.ERR_DB_CS_NOT_SUPPORTED,
+                              charset_id=self.charset_id)
+        return cpython.PyUnicode_AsUTF8AndSize(encoding, NULL)
+
+    cdef const char* _get_nencoding(self) except NULL:
         """
-        Checks that the national character set id is AL16UTF16, which is the
-        only id that is currently supported.
+        Returns the Python codec name for the national character set (needed
+        for NCHAR data) or raises an exception if it is unsupported. The result
+        points into the str owned by the mapping table, so it remains valid.
         """
-        if self.ncharset_id != TNS_CHARSET_UTF16:
+        cdef str encoding
+        if self.nencoding != NULL:
+            return self.nencoding
+        encoding = ORACLE_CHARSET_TO_PYTHON_ENCODING.get(self.ncharset_id)
+        if encoding is None:
             errors._raise_err(errors.ERR_NCHAR_CS_NOT_SUPPORTED,
                               charset_id=self.ncharset_id)
+        return cpython.PyUnicode_AsUTF8AndSize(encoding, NULL)
 
     @cython.boundscheck(False)
     cdef void _init_compile_caps(self):
diff --git a/src/oracledb/impl/thin/dbobject.pyx b/src/oracledb/impl/thin/dbobject.pyx
@@ -1,5 +1,5 @@
 #------------------------------------------------------------------------------
-# Copyright (c) 2022, 2025, Oracle and/or its affiliates.
+# Copyright (c) 2022, 2026, Oracle and/or its affiliates.
 #
 # This software is dual-licensed to you under the Universal Permissive License
 # (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl and Apache License
@@ -106,6 +106,7 @@ cdef class DbObjectPickleBuffer(GrowableBuffer):
         cdef:
             uint8_t image_flags, image_version
             BaseThinLobImpl lob_impl
+            const char* encoding
             const char_type *ptr
             ssize_t bytes_left
             uint32_t xml_flag
@@ -118,7 +119,8 @@ cdef class DbObjectPickleBuffer(GrowableBuffer):
         bytes_left = self.bytes_left()
         ptr = self.read_raw_bytes(bytes_left)
         if xml_flag & TNS_XML_TYPE_STRING:
-            return ptr[:bytes_left].decode()
+            encoding = conn_impl._protocol._caps._get_encoding()
+            return ptr[:bytes_left].decode(encoding)
         elif xml_flag & TNS_XML_TYPE_LOB:
             lob_impl = conn_impl._create_lob_impl(DB_TYPE_CLOB,
                                                   ptr[:bytes_left])
@@ -250,8 +252,10 @@ cdef class ThinDbObjectImpl(BaseDbObjectImpl):
         """
         cdef:
             uint8_t ora_type_num = metadata.dbtype._ora_type_num
+            BaseThinConnImpl conn_impl
             ThinDbObjectImpl obj_impl
             BaseThinLobImpl lob_impl
+            const char* encoding
             bytes temp_bytes
         if value is None:
             if metadata.objtype is not None \
@@ -260,10 +264,12 @@ cdef class ThinDbObjectImpl(BaseDbObjectImpl):
             else:
                 buf.write_uint8(TNS_NULL_LENGTH_INDICATOR)
         elif ora_type_num in (ORA_TYPE_NUM_CHAR, ORA_TYPE_NUM_VARCHAR):
+            conn_impl = self.type._conn_impl
             if metadata.dbtype._csfrm == CS_FORM_IMPLICIT:
-                temp_bytes = (<str> value).encode()
+                encoding = conn_impl._protocol._caps._get_encoding()
             else:
-                temp_bytes = (<str> value).encode(ENCODING_UTF16)
+                encoding = conn_impl._protocol._caps._get_nencoding()
+            temp_bytes = (<str> value).encode(encoding)
             buf.write_bytes_with_length(temp_bytes)
         elif ora_type_num == ORA_TYPE_NUM_NUMBER:
             temp_bytes = (<str> cpython.PyObject_Str(value)).encode()
@@ -353,19 +359,19 @@ cdef class ThinDbObjectImpl(BaseDbObjectImpl):
         """
         cdef:
             uint8_t ora_type_num = metadata.dbtype._ora_type_num
+            BaseThinConnImpl conn_impl = self.type._conn_impl
             uint8_t csfrm = metadata.dbtype._csfrm
             DbObjectPickleBuffer xml_buf
             bint is_null, is_collection
-            BaseThinConnImpl conn_impl
             ThinDbObjectImpl obj_impl
             BaseThinLobImpl lob_impl
+            const char* encoding
             OracleData data
             bytes locator
             type cls
         if ora_type_num in (ORA_TYPE_NUM_CLOB,
-                              ORA_TYPE_NUM_BLOB,
-                              ORA_TYPE_NUM_BFILE):
-            conn_impl = self.type._conn_impl
+                            ORA_TYPE_NUM_BLOB,
+                            ORA_TYPE_NUM_BFILE):
             locator = buf.read_bytes()
             if locator is None:
                 return None
@@ -381,7 +387,7 @@ cdef class ThinDbObjectImpl(BaseDbObjectImpl):
                     return None
                 xml_buf = DbObjectPickleBuffer.__new__(DbObjectPickleBuffer)
                 xml_buf._populate_from_bytes(xml_bytes)
-                return xml_buf.read_xmltype(self.type._conn_impl)
+                return xml_buf.read_xmltype(conn_impl)
             is_collection = \
                     metadata.objtype.is_collection or self.type.is_collection
             buf.get_is_atomic_null(is_collection, &is_null)
@@ -396,11 +402,12 @@ cdef class ThinDbObjectImpl(BaseDbObjectImpl):
             return PY_TYPE_DB_OBJECT._from_impl(obj_impl)
         buf.read_oracle_data(metadata, &data, from_dbobject=True,
                              decode_str=False)
-        if metadata.dbtype._csfrm == CS_FORM_NCHAR:
-            conn_impl = self.type._conn_impl
-            conn_impl._protocol._caps._check_ncharset_id()
+        if metadata.dbtype._csfrm == CS_FORM_IMPLICIT:
+            encoding = conn_impl._protocol._caps._get_encoding()
+        else:
+            encoding = conn_impl._protocol._caps._get_nencoding()
         return convert_oracle_data_to_python(metadata, metadata, &data,
-                                             encoding_errors=NULL,
+                                             encoding, encoding_errors=NULL,
                                              from_dbobject=True)
 
     def append_checked(self, object value):
diff --git a/src/oracledb/impl/thin/messages/base.pyx b/src/oracledb/impl/thin/messages/base.pyx
@@ -996,6 +996,7 @@ cdef class MessageWithData(Message):
             uint8_t num_bytes, ora_type_num, csfrm
             ThinDbObjectTypeImpl typ_impl
             BaseThinCursorImpl cursor_impl
+            const char *encoding = NULL
             object column_value = None
             ThinDbObjectImpl obj_impl
             int32_t actual_num_bytes
@@ -1074,14 +1075,14 @@ cdef class MessageWithData(Message):
                 decode_str=self.cursor_impl.fetching_arrow
             )
             if metadata.dbtype._csfrm == CS_FORM_NCHAR:
-                buf._caps._check_ncharset_id()
+                encoding = buf._caps._get_nencoding()
             if self.cursor_impl.fetching_arrow:
                 convert_oracle_data_to_arrow(
                     metadata, var_impl.metadata, &data, var_impl._arrow_array
                 )
             else:
                 column_value = convert_oracle_data_to_python(
-                    metadata, var_impl.metadata, &data,
+                    metadata, var_impl.metadata, &data, encoding,
                     var_impl._encoding_errors, from_dbobject=False
                 )
         if not self.in_fetch:
@@ -1396,6 +1397,7 @@ cdef class MessageWithData(Message):
         cdef:
             ThinDbObjectTypeImpl typ_impl
             BaseThinCursorImpl cursor_impl
+            const char* encoding = NULL
             BaseThinLobImpl lob_impl
             OracleMetadata metadata
             uint8_t ora_type_num
@@ -1409,8 +1411,11 @@ cdef class MessageWithData(Message):
             value = convert_arrow_to_oracle_data(metadata, &data,
                                                  var_impl._arrow_array, offset)
         else:
+            if metadata.dbtype._csfrm == CS_FORM_NCHAR:
+                encoding = ENCODING_UTF16
             value = convert_python_to_oracle_data(metadata, &data,
-                                                  var_impl._values[offset])
+                                                  var_impl._values[offset],
+                                                  encoding)
         ora_type_num = metadata.dbtype._ora_type_num
         if data.is_null:
             if ora_type_num == ORA_TYPE_NUM_BOOLEAN:
diff --git a/src/oracledb/impl/thin/messages/direct_path_load_stream.pyx b/src/oracledb/impl/thin/messages/direct_path_load_stream.pyx
diff --git a/src/oracledb/impl/thin/messages/direct_path_prepare.pyx b/src/oracledb/impl/thin/messages/direct_path_prepare.pyx
diff --git a/src/oracledb/impl/thin/messages/lob_op.pyx b/src/oracledb/impl/thin/messages/lob_op.pyx