Skip to content

Commit d778ee0

Browse files
committed
2.0.5: Reuse PHP's interned 1-character and empty strings
Because we're using the simdjson C module, we know the length before we need to allocate strings, so this is cheap to check. This saves time in the following areas for 0-byte and 1-byte strings:
- PHP doesn't need to allocate a temporary string and initialize it.
- PHP doesn't need to free the temporary string.
- PHP doesn't need to compute the hash of the temporary string.
- String comparisons are faster when strings are the exact same interned string.
- Memory usage is reduced because the string representation is reused.
- CPU caches may already have this interned string.
- If all array keys are interned strings, then PHP can skip the step of freeing array keys when garbage collecting the array.
1 parent 3767ea6 commit d778ee0

6 files changed

Lines changed: 222 additions & 22 deletions

File tree

benchmark/composer.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
"minimum-stability": "dev",
77
"prefer-stable": true,
88
"require": {
9-
"php": "^7.1"
9+
"php": "^7.1|^8.2"
1010
},
1111
"require-dev": {
12-
"phpbench/phpbench": "^0.17.1"
12+
"phpbench/phpbench": "^1.2.6"
1313
},
1414
"autoload-dev": {
1515
"psr-4": {
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace SimdjsonBench;
6+
7+
use PhpBench\Benchmark\Metadata\Annotations\Subject;
8+
9+
// The benchmarks below call simdjson_decode(), so stop the script when the simdjson extension is not loaded.
if (!extension_loaded('simdjson')) {
10+
exit;
11+
}
12+
13+
/**
 * This tests decoding an array with 100 objects where keys are single-char strings and values are
 * single-char strings or empty strings.
 *
 * In simdjson, those are optimized for memory usage in php 7.2+ by using php's interned string constants
 * instead of brand new strings.
 * (json_decode doesn't do that but could - when using the simdjson C module it's quicker since the length is known
 * before we need to allocate memory for the internal PHP string representation (zend_string))
 *
 * Note that in this benchmark, PHP garbage collecting the array returned by simdjson_decode can skip over
 * the array keys, because none of the keys were reference counted.
 *
 * @Revs(5)
 * @Iterations(5)
 * @Warmup(3)
 * @OutputTimeUnit("milliseconds", precision=5)
 * @BeforeMethods({"init"})
 * @Groups({"decode"})
 */
class SingleCharStringsBench
{
    /**
     * JSON document decoded by every subject; populated by init(), which is registered via @BeforeMethods.
     *
     * @var string
     */
    private $json;

    /**
     * Builds the JSON fixture: a list of 100 objects shaped like {"x":"<digit>","y":"<digit>","s":""}.
     *
     * @throws \RuntimeException if the fixture cannot be encoded
     */
    public function init(): void
    {
        $raw = [];
        for ($i = 0; $i < 100; $i++) {
            // chr(48) is '0', so 'x' cycles through '0'..'9' and 'y' through '0'..'7'.
            $raw[] = ['x' => chr(48 + ($i % 10)), 'y' => chr(48 + ($i % 8)), 's' => ''];
        }

        $json = \json_encode($raw);
        if (false === $json) {
            // Fail here rather than with a TypeError inside the decode subjects (strict_types is enabled).
            throw new \RuntimeException('failed to encode benchmark fixture: ' . \json_last_error_msg());
        }
        $this->json = $json;
    }

    /**
     * Decodes the fixture into nested arrays with PHP's built-in json_decode.
     *
     * @Subject()
     *
     * @throws \RuntimeException if the first decoded element does not have the expected value
     */
    public function jsonDecodeAssoc(): void
    {
        $data = \json_decode($this->json, true);

        if ('0' !== $data[0]['x']) {
            throw new \RuntimeException('error');
        }
    }

    /**
     * Decodes the fixture into stdClass objects with PHP's built-in json_decode.
     *
     * @Subject()
     *
     * @throws \RuntimeException if the first decoded element does not have the expected value
     */
    public function jsonDecode(): void
    {
        $data = \json_decode($this->json, false);

        if ('0' !== $data[0]->x) {
            throw new \RuntimeException('error');
        }
    }

    /**
     * Decodes the fixture into nested arrays with simdjson_decode.
     *
     * @Subject()
     *
     * @throws \RuntimeException if the first decoded element does not have the expected value
     */
    public function simdjsonDecodeAssoc(): void
    {
        $data = \simdjson_decode($this->json, true);

        if ('0' !== $data[0]['x']) {
            throw new \RuntimeException('error');
        }
    }

    /**
     * Decodes the fixture into stdClass objects with simdjson_decode.
     *
     * @Subject()
     *
     * @throws \RuntimeException if the first decoded element does not have the expected value
     */
    public function simdjsonDecode(): void
    {
        $data = \simdjson_decode($this->json, false);

        if ('0' !== $data[0]->x) {
            throw new \RuntimeException('error');
        }
    }
}

package.xml

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,16 @@
2020
-->
2121
<date>2022-10-01</date>
2222
<version>
23-
<release>2.0.4</release>
24-
<api>2.0.4</api>
23+
<release>2.0.5</release>
24+
<api>2.0.5</api>
2525
</version>
2626
<stability>
2727
<release>stable</release>
2828
<api>stable</api>
2929
</stability>
3030
<license uri="https://www.apache.org/licenses/LICENSE-2.0.html">Apache 2.0</license>
3131
<notes>
32-
* Add `-fvisibility=hidden` to compiler options to reduce compiled extension size by avoiding exporting symbols by default.
33-
* If the requested json parsing $depth is excessively large when reallocating larger buffers for the C simdjson parser,
34-
then internally use a smaller $depth that would behave identically with lower memory usage. (#66)
35-
* Update simdjson to properly reject surrogate pairs with an invalid low surrogate. (https://en.wikipedia.org/wiki/UTF-16)
32+
* Reuse PHP's 1-byte and 0-byte interned strings in simdjson_decode, reducing memory usage for those strings. (e.g. for the key/value in '{"x":""}')
3633
</notes>
3734
<contents>
3835
<dir name="/">
@@ -59,6 +56,7 @@
5956
<file name="decode_invalid_property.phpt" role="test"/>
6057
<file name="decode_max_depth.phpt" role="test"/>
6158
<file name="decode_max_depth_memory_reduction.phpt" role="test"/>
59+
<file name="decode_repeat.phpt" role="test"/>
6260
<file name="decode_result.phpt" role="test"/>
6361
<file name="decode_strict_types.phpt" role="test"/>
6462
<file name="decode_types.phpt" role="test"/>
@@ -93,6 +91,23 @@
9391
<providesextension>simdjson</providesextension>
9492
<extsrcrelease/>
9593
<changelog>
94+
<date>2022-10-01</date>
95+
<version>
96+
<release>2.0.4</release>
97+
<api>2.0.4</api>
98+
</version>
99+
<stability>
100+
<release>stable</release>
101+
<api>stable</api>
102+
</stability>
103+
<license uri="https://www.apache.org/licenses/LICENSE-2.0.html">Apache 2.0</license>
104+
<notes>
105+
* Add `-fvisibility=hidden` to compiler options to reduce compiled extension size by avoiding exporting symbols by default.
106+
* If the requested json parsing $depth is excessively large when reallocating larger buffers for the C simdjson parser,
107+
then internally use a smaller $depth that would behave identically with lower memory usage. (#66)
108+
* Update simdjson to fix handling of surrogate pairs with invalid low surrogate.
109+
</notes>
110+
</release>
96111
<release>
97112
<date>2022-08-30</date>
98113
<version>

php_simdjson.h

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
extern zend_module_entry simdjson_module_entry;
1818
#define phpext_simdjson_ptr &simdjson_module_entry
1919

20-
#define PHP_SIMDJSON_VERSION "2.0.4"
20+
#define PHP_SIMDJSON_VERSION "2.0.5"
2121
#define SIMDJSON_SUPPORT_URL "https://github.com/crazyxman/simdjson_php"
2222
#define SIMDJSON_PARSE_FAIL 0
2323
#define SIMDJSON_PARSE_SUCCESS 1
@@ -26,9 +26,6 @@ extern zend_module_entry simdjson_module_entry;
2626

2727
#define SIMDJSON_PARSE_DEFAULT_DEPTH 512
2828

29-
#define SIMDJSON_RESOUCE_PJH_TYPE 3
30-
#define SIMDJSON_RESOUCE_PJ_TYPE 4
31-
3229

3330
extern PHPAPI void php_var_dump(zval **struc, int level);
3431
extern PHPAPI void php_debug_zval_dump(zval **struc, int level);

src/bindings.cpp

Lines changed: 69 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,46 @@ build_parsed_json_cust(simdjson::dom::parser& parser, simdjson::dom::element &do
7171
return simdjson::SUCCESS;
7272
}
7373

74-
/* }}} */
74+
/**
 * Set the zval `v` to a PHP string holding the `len` bytes starting at `buf`.
 *
 * On php 7.2+, empty and 1-byte strings reuse PHP's interned strings instead of allocating a new zend_string.
 * Longer strings (and all strings on php 7.1) are copied with ZVAL_STRINGL.
 */
static zend_always_inline void simdjson_set_zval_to_string(zval *v, const char *buf, size_t len) {
	/* In php 7.1, the ZSTR_CHAR macro doesn't exist, and CG(one_char_string)[chr] may or may not be null */
#if PHP_VERSION_ID >= 70200
	if (len <= 1) {
		/*
		A note on performance benefits of the use of interned strings here and elsewhere:

		- PHP doesn't need to allocate a temporary string and initialize it
		- PHP doesn't need to free the temporary string
		- PHP doesn't need to compute the hash of the temporary string
		- Memory usage is reduced because the string representation is reused
		- String comparisons are faster when the strings are the exact same pointer.
		- CPU caches may already have this interned string
		- If all array keys are interned strings, then php can skip the step of
		  freeing array keys when garbage collecting the array.
		*/
		/* ZSTR_CHAR indexes a table of one-character strings. Plain char may be signed,
		 * so cast to unsigned char to guarantee a non-negative index for bytes >= 0x80. */
		zend_string *key = len == 1 ? ZSTR_CHAR((unsigned char)buf[0]) : ZSTR_EMPTY_ALLOC();
		ZVAL_INTERNED_STR(v, key);
		return;
	}
#endif
	ZVAL_STRINGL(v, buf, len);
}
97+
98+
/**
 * Add `value` to the hash table `ht` under the key made of the `len` bytes starting at `buf`,
 * replacing any value already stored under that key.
 *
 * On php 7.2+, empty and 1-byte keys reuse PHP's interned strings instead of allocating a new zend_string.
 */
static zend_always_inline void simdjson_add_key_to_symtable(HashTable *ht, const char *buf, size_t len, zval *value) {
#if PHP_VERSION_ID >= 70200
	if (len <= 1) {
		/* Look up the interned string (i.e. not reference counted).
		 * Plain char may be signed, so cast to unsigned char to guarantee
		 * a non-negative index into the one-character string table. */
		zend_string *key = len == 1 ? ZSTR_CHAR((unsigned char)buf[0]) : ZSTR_EMPTY_ALLOC();
		/* Add the key or update the existing value of the key. */
		zend_symtable_update(ht, key, value);
		/* zend_string_release_ex is a no-op for interned strings */
		return;
	}
#endif
	zend_string *key = zend_string_init(buf, len, 0);
	zend_symtable_update(ht, key, value);
	/* Release the reference counted key */
	zend_string_release_ex(key, 0);
}
75114

76115
static zend_always_inline void simdjson_set_zval_to_int64(zval *zv, const int64_t value) {
77116
#if SIZEOF_ZEND_LONG < 8
@@ -89,7 +128,7 @@ static zval create_array(simdjson::dom::element element) /* {{{ */ {
89128
switch (element.type()) {
90129
//ASCII sort
91130
case simdjson::dom::element_type::STRING :
92-
ZVAL_STRINGL(&v, element.get_c_str().value_unsafe(), element.get_string_length().value_unsafe());
131+
simdjson_set_zval_to_string(&v, element.get_c_str().value_unsafe(), element.get_string_length().value_unsafe());
93132
break;
94133
case simdjson::dom::element_type::INT64 :
95134
simdjson_set_zval_to_int64(&v, element.get_int64().value_unsafe());
@@ -141,9 +180,7 @@ static zval create_array(simdjson::dom::element element) /* {{{ */ {
141180
for (simdjson::dom::key_value_pair field : json_object) {
142181
zval value = create_array(field.value);
143182
/* TODO consider using zend_string_init_existing_interned in php 8.1+ to save memory and time freeing strings. */
144-
zend_string *key = zend_string_init(field.key.data(), field.key.size(), 0);
145-
zend_symtable_update(arr, key, &value);
146-
zend_string_release_ex(key, 0);
183+
simdjson_add_key_to_symtable(arr, field.key.data(), field.key.size(), &value);
147184
}
148185
break;
149186
}
@@ -161,7 +198,7 @@ static zval create_object(simdjson::dom::element element) /* {{{ */ {
161198
switch (element.type()) {
162199
//ASCII sort
163200
case simdjson::dom::element_type::STRING :
164-
ZVAL_STRINGL(&v, element.get_c_str().value_unsafe(), element.get_string_length().value_unsafe());
201+
simdjson_set_zval_to_string(&v, element.get_c_str().value_unsafe(), element.get_string_length().value_unsafe());
165202
break;
166203
case simdjson::dom::element_type::INT64 :
167204
simdjson_set_zval_to_int64(&v, element.get_int64().value_unsafe());
@@ -214,14 +251,36 @@ static zval create_object(simdjson::dom::element element) /* {{{ */ {
214251
return v;
215252
}
216253
zval value = create_object(field.value);
254+
255+
/* Add the key to the object */
217256
#if PHP_VERSION_ID >= 80000
218-
/* TODO consider using zend_string_init_existing_interned in php 8.1+ to save memory and time freeing strings. */
219-
zend_string *key = zend_string_init(data, size, 0);
220-
obj->handlers->write_property(obj, key, &value, NULL);
257+
zend_string *key;
258+
if (size <= 1) {
259+
key = size == 1 ? ZSTR_CHAR(data[0]) : ZSTR_EMPTY_ALLOC();
260+
} else {
261+
key = zend_string_init(data, size, 0);
262+
}
263+
zend_std_write_property(obj, key, &value, NULL);
221264
zend_string_release_ex(key, 0);
222265
#else
223-
add_property_zval_ex(&v, data, size, &value);
266+
267+
# if PHP_VERSION_ID >= 70200
268+
if (size <= 1) {
269+
zval zkey;
270+
zend_string *key = size == 1 ? ZSTR_CHAR(data[0]) : ZSTR_EMPTY_ALLOC();
271+
ZVAL_INTERNED_STR(&zkey, key);
272+
zend_std_write_property(&v, &zkey, &value, NULL);
273+
} else
274+
# endif
275+
{
276+
zval zkey;
277+
ZVAL_STRINGL(&zkey, data, size);
278+
zend_std_write_property(&v, &zkey, &value, NULL);
279+
zval_ptr_dtor_nogc(&zkey);
280+
}
224281
#endif
282+
/* After the value is stored in the object under this key (incrementing the value's reference count),
283+
* decrement the reference count of the value by one */
225284
zval_ptr_dtor_nogc(&value);
226285
}
227286
break;

tests/decode_repeat.phpt

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
--TEST--
2+
simdjson_decode repeated keys use the last value
3+
--FILE--
4+
<?php
5+
// Each key appears twice in the JSON; the decoded result should keep the last value of every repeated key.
6+
$json = '{"a":"b","a":"c","0":[1],"0":[2],"foo":"bar","foo":"baz"}';
7+
$value = \simdjson_decode($json, false);
8+
var_dump($value);
9+
10+
$value = \simdjson_decode($json, true);
11+
var_dump($value);
12+
?>
13+
--EXPECT--
14+
object(stdClass)#1 (3) {
15+
["a"]=>
16+
string(1) "c"
17+
["0"]=>
18+
array(1) {
19+
[0]=>
20+
int(2)
21+
}
22+
["foo"]=>
23+
string(3) "baz"
24+
}
25+
array(3) {
26+
["a"]=>
27+
string(1) "c"
28+
[0]=>
29+
array(1) {
30+
[0]=>
31+
int(2)
32+
}
33+
["foo"]=>
34+
string(3) "baz"
35+
}

0 commit comments

Comments
 (0)