This repository was archived by the owner on Jan 20, 2026. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 19
Expand file tree
/
Copy pathiterator.go
More file actions
454 lines (389 loc) · 13.1 KB
/
iterator.go
File metadata and controls
454 lines (389 loc) · 13.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
package pebbledb
import (
"bytes"
"fmt"
"time"
"github.com/cockroachdb/pebble"
"github.com/sei-protocol/sei-db/ss/types"
"golang.org/x/exp/slices"
)
var _ types.DBIterator = (*iterator)(nil)
// iterator implements the Iterator interface. It wraps a PebbleDB iterator
// with added MVCC key handling logic. The iterator will iterate over the key space
// in the provided domain for a given version. If a key has been written at the
// provided version, that key/value pair will be iterated over. Otherwise, the
// latest version for that key/value pair will be iterated over s.t. it's less
// than the provided version. Note:
//
// - The start key must not be empty.
// - Currently, reverse iteration is NOT supported.
type iterator struct {
source *pebble.Iterator
prefix, start, end []byte
version int64
valid bool
reverse bool
}
func newPebbleDBIterator(src *pebble.Iterator, prefix, mvccStart, mvccEnd []byte, version int64, earliestVersion int64, reverse bool) *iterator {
var startStr, endStr string
if mvccStart != nil {
startStr = string(mvccStart)
}
if mvccEnd != nil {
endStr = string(mvccEnd)
}
fmt.Printf("[%d] SSDEBUG - newPebbleDBIterator prefix %s (length %d) mvccStart %s (length %d) mvccEnd %s (length %d) version %d earliestVersion %d reverse %t\n", time.Now().UnixNano(), string(prefix), len(prefix), startStr, len(mvccStart), endStr, len(mvccEnd), version, earliestVersion, reverse)
// Return invalid iterator if requested iterator height is lower than earliest version after pruning
if version < earliestVersion {
return &iterator{
source: src,
prefix: prefix,
start: mvccStart,
end: mvccEnd,
version: version,
valid: false,
reverse: reverse,
}
}
// move the underlying PebbleDB iterator to the first key
var valid bool
if reverse {
valid = src.Last()
} else {
valid = src.First()
}
itr := &iterator{
source: src,
prefix: prefix,
start: mvccStart,
end: mvccEnd,
version: version,
valid: valid,
reverse: reverse,
}
if valid {
currKey, currKeyVersion, ok := SplitMVCCKey(itr.source.Key())
if !ok {
// XXX: This should not happen as that would indicate we have a malformed MVCC key.
panic(fmt.Sprintf("invalid PebbleDB MVCC key: %s", itr.source.Key()))
}
curKeyVersionDecoded, err := decodeUint64Ascending(currKeyVersion)
if err != nil {
itr.valid = false
return itr
}
// We need to check whether initial key iterator visits has a version <= requested version
// If larger version, call next to find another key which does
if curKeyVersionDecoded > itr.version {
itr.Next()
} else {
// If version is less, seek to the largest version of that key <= requested iterator version
// It is guaranteed this won't move the iterator to a key that is invalid since
// curKeyVersionDecoded <= requested iterator version, so there exists at least one version of currKey SeekLT may move to
itr.valid = itr.source.SeekLT(MVCCEncode(currKey, itr.version+1))
}
}
// Make sure we skip to the next key if the current one is tombstone
if valTombstoned(itr.source.Value()) {
if reverse {
itr.nextReverse()
} else {
itr.nextForward()
}
}
return itr
}
// Domain returns the domain of the iterator. The caller must not modify the
// return values.
func (itr *iterator) Domain() ([]byte, []byte) {
fmt.Printf("[%d] SSDEBUG - iterator.Domain\n", time.Now().UnixNano())
return itr.start, itr.end
}
func (itr *iterator) Key() []byte {
fmt.Printf("[%d] SSDEBUG - iterator.Key\n", time.Now().UnixNano())
itr.assertIsValid()
key, _, ok := SplitMVCCKey(itr.source.Key())
if !ok {
// XXX: This should not happen as that would indicate we have a malformed
// MVCC key.
panic(fmt.Sprintf("invalid PebbleDB MVCC key: %s", itr.source.Key()))
}
keyCopy := slices.Clone(key)
result := keyCopy[len(itr.prefix):]
fmt.Printf("[%d] SSDEBUG - iterator.Key returning key %s (length %d)\n", time.Now().UnixNano(), string(result), len(result))
return result
}
func (itr *iterator) Value() []byte {
fmt.Printf("[%d] SSDEBUG - iterator.Value\n", time.Now().UnixNano())
itr.assertIsValid()
val, _, ok := SplitMVCCKey(itr.source.Value())
if !ok {
// XXX: This should not happen as that would indicate we have a malformed
// MVCC value.
panic(fmt.Sprintf("invalid PebbleDB MVCC value: %s", itr.source.Key()))
}
result := slices.Clone(val)
fmt.Printf("[%d] SSDEBUG - iterator.Value returning value %s (length %d)\n", time.Now().UnixNano(), string(result), len(result))
return result
}
func (itr *iterator) nextForward() {
fmt.Printf("[%d] SSDEBUG - iterator.nextForward\n", time.Now().UnixNano())
if !itr.source.Valid() {
itr.valid = false
return
}
currKey, _, ok := SplitMVCCKey(itr.source.Key())
if !ok {
// XXX: This should not happen as that would indicate we have a malformed
// MVCC key.
panic(fmt.Sprintf("invalid PebbleDB MVCC key: %s", itr.source.Key()))
}
next := itr.source.NextPrefix()
// First move the iterator to the next prefix, which may not correspond to the
// desired version for that key, e.g. if the key was written at a later version,
// so we seek back to the latest desired version, s.t. the version is <= itr.version.
if next {
nextKey, _, ok := SplitMVCCKey(itr.source.Key())
if !ok {
// XXX: This should not happen as that would indicate we have a malformed
// MVCC key.
itr.valid = false
return
}
if !bytes.HasPrefix(nextKey, itr.prefix) {
// the next key must have itr.prefix as the prefix
itr.valid = false
return
}
// Move the iterator to the closest version to the desired version, so we
// append the current iterator key to the prefix and seek to that key.
itr.valid = itr.source.SeekLT(MVCCEncode(nextKey, itr.version+1))
tmpKey, tmpKeyVersion, ok := SplitMVCCKey(itr.source.Key())
if !ok {
// XXX: This should not happen as that would indicate we have a malformed
// MVCC key.
itr.valid = false
return
}
// There exists cases where the SeekLT() call moved us back to the same key
// we started at, so we must move to next key, i.e. two keys forward.
if bytes.Equal(tmpKey, currKey) {
if itr.source.NextPrefix() {
itr.nextForward()
_, tmpKeyVersion, ok = SplitMVCCKey(itr.source.Key())
if !ok {
// XXX: This should not happen as that would indicate we have a malformed
// MVCC key.
itr.valid = false
return
}
} else {
itr.valid = false
return
}
}
// We need to verify that every Next call either moves the iterator to a key whose version
// is less than or equal to requested iterator version, or exhausts the iterator
tmpKeyVersionDecoded, err := decodeUint64Ascending(tmpKeyVersion)
if err != nil {
itr.valid = false
return
}
// If iterator is at a entry whose version is higher than requested version, call nextForward again
if tmpKeyVersionDecoded > itr.version {
itr.nextForward()
}
// The cursor might now be pointing at a key/value pair that is tombstoned.
// If so, we must move the cursor.
if itr.valid && itr.cursorTombstoned() {
itr.nextForward()
}
return
}
itr.valid = false
}
func (itr *iterator) nextReverse() {
fmt.Printf("[%d] SSDEBUG - iterator.nextReverse\n", time.Now().UnixNano())
if !itr.source.Valid() {
itr.valid = false
return
}
currKey, _, ok := SplitMVCCKey(itr.source.Key())
if !ok {
// XXX: This should not happen as that would indicate we have a malformed
// MVCC key.
panic(fmt.Sprintf("invalid PebbleDB MVCC key: %s", itr.source.Key()))
}
next := itr.source.SeekLT(MVCCEncode(currKey, 0))
// First move the iterator to the next prefix, which may not correspond to the
// desired version for that key, e.g. if the key was written at a later version,
// so we seek back to the latest desired version, s.t. the version is <= itr.version.
if next {
nextKey, _, ok := SplitMVCCKey(itr.source.Key())
if !ok {
// XXX: This should not happen as that would indicate we have a malformed
// MVCC key.
itr.valid = false
return
}
if !bytes.HasPrefix(nextKey, itr.prefix) {
// the next key must have itr.prefix as the prefix
itr.valid = false
return
}
// Move the iterator to the closest version to the desired version, so we
// append the current iterator key to the prefix and seek to that key.
itr.valid = itr.source.SeekLT(MVCCEncode(nextKey, itr.version+1))
_, tmpKeyVersion, ok := SplitMVCCKey(itr.source.Key())
if !ok {
// XXX: This should not happen as that would indicate we have a malformed
// MVCC key.
itr.valid = false
return
}
// We need to verify that every Next call either moves the iterator to a key whose version
// is less than or equal to requested iterator version, or exhausts the iterator
tmpKeyVersionDecoded, err := decodeUint64Ascending(tmpKeyVersion)
if err != nil {
itr.valid = false
return
}
// If iterator is at a entry whose version is higher than requested version, call nextReverse again
if tmpKeyVersionDecoded > itr.version {
itr.nextReverse()
}
// The cursor might now be pointing at a key/value pair that is tombstoned.
// If so, we must move the cursor.
if itr.valid && itr.cursorTombstoned() {
itr.nextReverse()
}
return
}
itr.valid = false
}
func (itr *iterator) Next() {
fmt.Printf("[%d] SSDEBUG - iterator.Next reverse %t\n", time.Now().UnixNano(), itr.reverse)
if itr.reverse {
itr.nextReverse()
} else {
itr.nextForward()
}
}
func (itr *iterator) Valid() bool {
fmt.Printf("[%d] SSDEBUG - iterator.Valid\n", time.Now().UnixNano())
// once invalid, forever invalid
if !itr.valid || !itr.source.Valid() {
itr.valid = false
return itr.valid
}
// if source has error, consider it invalid
if err := itr.source.Error(); err != nil {
itr.valid = false
return itr.valid
}
// if key is at the end or past it, consider it invalid
if end := itr.end; end != nil {
if bytes.Compare(end, itr.Key()) <= 0 {
itr.valid = false
return itr.valid
}
}
return true
}
func (itr *iterator) Error() error {
fmt.Printf("[%d] SSDEBUG - iterator.Error\n", time.Now().UnixNano())
return itr.source.Error()
}
func (itr *iterator) Close() error {
fmt.Printf("[%d] SSDEBUG - iterator.Close\n", time.Now().UnixNano())
_ = itr.source.Close()
itr.source = nil
itr.valid = false
return nil
}
func (itr *iterator) assertIsValid() {
if !itr.valid {
panic("iterator is invalid")
}
}
// cursorTombstoned checks if the current cursor is pointing at a key/value pair
// that is tombstoned. If the cursor is tombstoned, <true> is returned, otherwise
// <false> is returned. In the case where the iterator is valid but the key/value
// pair is tombstoned, the caller should call Next(). Note, this method assumes
// the caller assures the iterator is valid first!
func (itr *iterator) cursorTombstoned() bool {
fmt.Printf("[%d] SSDEBUG - iterator.cursorTombstoned\n", time.Now().UnixNano())
_, tombBz, ok := SplitMVCCKey(itr.source.Value())
if !ok {
// XXX: This should not happen as that would indicate we have a malformed
// MVCC value.
panic(fmt.Sprintf("invalid PebbleDB MVCC value: %s", itr.source.Key()))
}
// If the tombstone suffix is empty, we consider this a zero value and thus it
// is not tombstoned.
if len(tombBz) == 0 {
return false
}
// If the tombstone suffix is non-empty and greater than the target version,
// the value is not tombstoned.
tombstone, err := decodeUint64Ascending(tombBz)
if err != nil {
panic(fmt.Errorf("failed to decode value tombstone: %w", err))
}
if tombstone > itr.version {
return false
}
return true
}
func (itr *iterator) DebugRawIterate() {
fmt.Printf("[%d] SSDEBUG - iterator.DebugRawIterate\n", time.Now().UnixNano())
valid := itr.source.Valid()
if valid {
// The first key may not represent the desired target version, so move the
// cursor to the correct location.
firstKey, _, _ := SplitMVCCKey(itr.source.Key())
valid = itr.source.SeekLT(MVCCEncode(firstKey, itr.version+1))
}
for valid {
key, vBz, ok := SplitMVCCKey(itr.source.Key())
if !ok {
panic(fmt.Sprintf("invalid PebbleDB MVCC key: %s", itr.source.Key()))
}
version, err := decodeUint64Ascending(vBz)
if err != nil {
panic(fmt.Errorf("failed to decode key version: %w", err))
}
val, tombBz, ok := SplitMVCCKey(itr.source.Value())
if !ok {
panic(fmt.Sprintf("invalid PebbleDB MVCC value: %s", itr.source.Value()))
}
var tombstone int64
if len(tombBz) > 0 {
tombstone, err = decodeUint64Ascending(vBz)
if err != nil {
panic(fmt.Errorf("failed to decode value tombstone: %w", err))
}
}
fmt.Printf("KEY: %s, VALUE: %s, VERSION: %d, TOMBSTONE: %d\n", key, val, version, tombstone)
var next bool
if itr.reverse {
next = itr.source.SeekLT(MVCCEncode(key, 0))
} else {
next = itr.source.NextPrefix()
}
if next {
nextKey, _, ok := SplitMVCCKey(itr.source.Key())
if !ok {
panic(fmt.Sprintf("invalid PebbleDB MVCC key: %s", itr.source.Key()))
}
// the next key must have itr.prefix as the prefix
if !bytes.HasPrefix(nextKey, itr.prefix) {
valid = false
} else {
valid = itr.source.SeekLT(MVCCEncode(nextKey, itr.version+1))
}
} else {
valid = false
}
}
}