From 23338f1e172fb1fc5c03a004df78ed7ed26dff31 Mon Sep 17 00:00:00 2001 From: Mark Molinaro Date: Sat, 13 Jun 2026 06:52:38 +0000 Subject: [PATCH 1/2] python: avoid utf8 copy during fastbinary string encode Use PyUnicode_AsUTF8AndSize when available, with a fallback for older Python 3 releases, so UTF-8 strings can be encoded without allocating an intermediate PyBytes object. Performance (50k iterations, warmed) | Workload | Baseline | This commit | Speedup | |----------|----------|-------------|---------| | encode simple (30B, 1 string) | 0.60 us | 0.55 us | 1.09x | | encode 10-string (182B) | 1.44 us | 1.01 us | 1.43x | | encode complex (395B) | 3.02 us | 2.56 us | 1.18x | The more string fields a struct has, the larger the gain. Decode is unchanged. --- lib/py/src/ext/protocol.tcc | 16 ++++++++++- lib/py/test/thrift_TBinaryProtocol.py | 39 +++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/lib/py/src/ext/protocol.tcc b/lib/py/src/ext/protocol.tcc index 448fc6f105e..beead4e7a5d 100644 --- a/lib/py/src/ext/protocol.tcc +++ b/lib/py/src/ext/protocol.tcc @@ -456,18 +456,32 @@ bool ProtocolBase::encodeValue(PyObject* value, TType type, PyObject* type case T_STRING: { ScopedPyObject nval; + Py_ssize_t len; if (PyUnicode_Check(value)) { +#if PY_VERSION_HEX >= 0x03030000 + const char* str = PyUnicode_AsUTF8AndSize(value, &len); + if (!str) { + return false; + } + if (!detail::check_ssize_t_32(len)) { + return false; + } + + impl()->writeI32(static_cast(len)); + return writeBuffer(const_cast(str), static_cast(len)); +#else nval.reset(PyUnicode_AsUTF8String(value)); if (!nval) { return false; } +#endif } else { Py_INCREF(value); nval.reset(value); } - Py_ssize_t len = PyBytes_Size(nval.get()); + len = PyBytes_Size(nval.get()); if (!detail::check_ssize_t_32(len)) { return false; } diff --git a/lib/py/test/thrift_TBinaryProtocol.py b/lib/py/test/thrift_TBinaryProtocol.py index d4269eb6175..b7e9b62399b 100644 --- a/lib/py/test/thrift_TBinaryProtocol.py +++ b/lib/py/test/thrift_TBinaryProtocol.py @@ -22,7 +22,9 @@ import uuid import _import_local_thrift # noqa +from thrift.Thrift import TApplicationException from thrift.protocol.TBinaryProtocol import TBinaryProtocol +from thrift.protocol.TBinaryProtocol import TBinaryProtocolAcceleratedFactory from thrift.protocol.TProtocol import TProtocolException from thrift.transport import TTransport @@ -167,6 +169,16 @@ def testField(type, data): protocol.readStructEnd() +APPLICATION_EXCEPTION_TYPEARGS = [ + TApplicationException, + ( + None, + (1, 11, "message", "UTF8", None), + (2, 8, "type", None, None), + ), +] + + def testMessage(data, strict=True): message = {} message['name'] = data[0] @@ -196,6 +208,13 @@ def testMessage(data, strict=True): class TestTBinaryProtocol(unittest.TestCase): + def setUp(self): + try: + from thrift.protocol import fastbinary # noqa: F401 + self._has_fastbinary = True + except ImportError: + self._has_fastbinary = False + def test_TBinaryProtocol_write_read(self): try: testNaked('Byte', 123) @@ -280,6 +299,26 @@ def test_TBinaryProtocol_write_read(self): print("Assertion fail") raise e + def test_accelerated_utf8_roundtrip_on_application_exception(self): + if not self._has_fastbinary: + self.skipTest("C extension not available") + + original = TApplicationException( + type=TApplicationException.PROTOCOL_ERROR, + message=("snowman-\u2603-rocket-\U0001F680-" * 32), + ) + + otrans = TTransport.TMemoryBuffer() + oproto = TBinaryProtocolAcceleratedFactory(fallback=False).getProtocol(otrans) + oproto.trans.write(oproto._fast_encode(original, APPLICATION_EXCEPTION_TYPEARGS)) + + itrans = TTransport.TMemoryBuffer(otrans.getvalue()) + iproto = TBinaryProtocolAcceleratedFactory(fallback=False).getProtocol(itrans) + decoded = iproto._fast_decode(None, iproto, APPLICATION_EXCEPTION_TYPEARGS) + + self.assertEqual(decoded.message, original.message) + self.assertEqual(decoded.type, original.type) + def test_TBinaryProtocol_no_strict_write_read(self): TMessageType = {"T_CALL": 1, "T_REPLY": 2, "T_EXCEPTION": 3, "T_ONEWAY": 4} test_data = [("short message name", TMessageType['T_CALL'], 0), From 0691570336b46b4eb5f83bc24c57012de9c68f0e Mon Sep 17 00:00:00 2001 From: Mark Molinaro Date: Sat, 13 Jun 2026 07:22:49 +0000 Subject: [PATCH 2/2] python: make fastbinary writeBuffer const-correct Change the internal fastbinary writeBuffer helper to accept const input buffers so the zero-copy UTF-8 encode path does not need to const_cast CPython-managed string storage. --- lib/py/src/ext/binary.h | 2 +- lib/py/src/ext/compact.h | 2 +- lib/py/src/ext/protocol.h | 2 +- lib/py/src/ext/protocol.tcc | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/py/src/ext/binary.h b/lib/py/src/ext/binary.h index dd7750b49a8..9ccc87bc530 100644 --- a/lib/py/src/ext/binary.h +++ b/lib/py/src/ext/binary.h @@ -88,7 +88,7 @@ class BinaryProtocol : public ProtocolBase { return encodeValue(value, parsedspec.type, parsedspec.typeargs); } - void writeUuid(char* value) { + void writeUuid(const char* value) { writeBuffer(value, 16); } diff --git a/lib/py/src/ext/compact.h b/lib/py/src/ext/compact.h index 0d8946b3441..7f9c017ebd8 100644 --- a/lib/py/src/ext/compact.h +++ b/lib/py/src/ext/compact.h @@ -104,7 +104,7 @@ class CompactProtocol : public ProtocolBase { void writeFieldStop() { writeByte(0); } - void writeUuid(char* value) { + void writeUuid(const char* value) { writeBuffer(value, 16); } diff --git a/lib/py/src/ext/protocol.h b/lib/py/src/ext/protocol.h index 20911c89724..2e737239526 100644 --- a/lib/py/src/ext/protocol.h +++ b/lib/py/src/ext/protocol.h @@ -71,7 +71,7 @@ class ProtocolBase { return true; } - bool writeBuffer(char* data, size_t len); + bool writeBuffer(const char* data, size_t len); void writeByte(uint8_t val) { writeBuffer(reinterpret_cast(&val), 1); } diff --git a/lib/py/src/ext/protocol.tcc b/lib/py/src/ext/protocol.tcc index beead4e7a5d..b09fe574243 100644 --- a/lib/py/src/ext/protocol.tcc +++ b/lib/py/src/ext/protocol.tcc @@ -89,7 +89,7 @@ PyObject* ProtocolBase::getEncodedValue() { } template -inline bool ProtocolBase::writeBuffer(char* data, size_t size) { +inline bool ProtocolBase::writeBuffer(const char* data, size_t size) { if (!PycStringIO) { PycString_IMPORT; } @@ -169,7 +169,7 @@ PyObject* ProtocolBase::getEncodedValue() { } template -inline bool ProtocolBase::writeBuffer(char* data, size_t size) { +inline bool ProtocolBase::writeBuffer(const char* data, size_t size) { size_t need = size + output_->pos; if (output_->buf.capacity() < need) { try { @@ -469,7 +469,7 @@ bool ProtocolBase::encodeValue(PyObject* value, TType type, PyObject* type } impl()->writeI32(static_cast(len)); - return writeBuffer(const_cast(str), static_cast(len)); + return writeBuffer(str, static_cast(len)); #else nval.reset(PyUnicode_AsUTF8String(value)); if (!nval) {