analytics-python/segment/analytics/consumer.py at 12d2f085411ac4ed5fceee1f2f9c49666f71eb4c · segmentio/analytics-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
import logging
import time
import random
from threading import Thread
import json

from segment.analytics.request import post, APIError, DatetimeSerializer, parse_retry_after

from queue import Empty

# Upper bound on the size of a single message: 32 KiB.
MAX_MSG_SIZE = 32 * 1024

# The servers only accept batches smaller than 500KB. The limit used here is
# kept a little below that so data appended later (e.g. "sentAt") still fits.
BATCH_SIZE_LIMIT = 475_000

# Default duration limits: 12 hours, expressed in seconds.
DEFAULT_MAX_TOTAL_BACKOFF_DURATION = 12 * 60 * 60
DEFAULT_MAX_RATE_LIMIT_DURATION = 12 * 60 * 60


class FatalError(Exception):
    """Error that must not be retried.

    ``Consumer.request`` treats this exception as non-retryable: it is
    logged and re-raised immediately instead of going through backoff.

    Attributes:
        message: Human-readable description of the failure.
    """

    def __init__(self, message):
        # Initialise the base class so the exception is fully formed
        # (args, pickling, default repr) like any other Exception.
        super().__init__(message)
        self.message = message

    def __str__(self):
        """Return the message prefixed with the ``[Segment]`` tag."""
        # The previous format string ended with a stray ")" that leaked
        # into every rendered error message.
        return "[Segment] {0}".format(self.message)


class Consumer(Thread):
    """Consumes the messages from the client's queue.

    A daemon thread that repeatedly pulls a batch of messages off the queue
    and uploads it with ``post``. Retryable failures are retried with
    exponential backoff; a 429 response carrying a valid Retry-After value
    puts the consumer into a rate-limited state in which the batch is
    re-queued and the next upload waits until the limit expires.
    """
    log = logging.getLogger('segment')

    def __init__(self, queue, write_key, upload_size=100, host=None,
                 on_error=None, upload_interval=0.5, gzip=False, retries=10,
                 timeout=15, proxies=None, oauth_manager=None,
                 max_total_backoff_duration=DEFAULT_MAX_TOTAL_BACKOFF_DURATION,
                 max_rate_limit_duration=DEFAULT_MAX_RATE_LIMIT_DURATION):
        """Create a consumer thread.

        Args:
            queue: Queue the messages are read from (and re-queued to on 429).
            write_key: Passed through to ``post``.
            upload_size: Maximum number of items collected into one batch.
            host: Passed through to ``post``.
            on_error: Optional callable invoked as ``on_error(exc, batch)``
                when a batch fails or is dropped.
            upload_interval: Maximum time, in seconds, ``next()`` spends
                collecting a batch.
            gzip: Passed through to ``post``.
            retries: Maximum number of backoff retries per batch.
            timeout: Passed through to ``post``.
            proxies: Passed through to ``post``.
            oauth_manager: Passed through to ``post``; when set, status 511
                is treated as retryable.
            max_total_backoff_duration: Seconds after the first failure of a
                batch beyond which no further retries are attempted.
            max_rate_limit_duration: Seconds of continuous rate limiting
                after which the pending batch is dropped.
        """
        super().__init__()
        # Make consumer a daemon thread so that it doesn't block program exit
        self.daemon = True
        self.upload_size = upload_size
        self.upload_interval = upload_interval
        self.write_key = write_key
        self.host = host
        self.on_error = on_error
        self.queue = queue
        self.gzip = gzip
        # It's important to set running in the constructor: if we are asked to
        # pause immediately after construction, we might set running to True in
        # run() *after* we set it to False in pause... and keep running
        # forever.
        self.running = True
        self.retries = retries
        self.timeout = timeout
        self.proxies = proxies
        self.oauth_manager = oauth_manager
        self.max_total_backoff_duration = max_total_backoff_duration
        self.max_rate_limit_duration = max_rate_limit_duration

        # Rate-limit state:
        #   rate_limited_until    -- time.time() value before which no upload
        #                            should be attempted, or None
        #   rate_limit_start_time -- time.time() value at which the current
        #                            rate-limited period began, or None
        self.rate_limited_until = None
        self.rate_limit_start_time = None

    def run(self):
        """Runs the consumer until ``pause()`` clears ``running``."""
        self.log.debug('consumer is running...')
        while self.running:
            self.upload()

        self.log.debug('consumer exited.')

    def pause(self):
        """Pause the consumer."""
        # run() re-checks this flag between uploads.
        self.running = False

    def set_rate_limit_state(self, response):
        """Record rate-limit state from a 429 response.

        ``rate_limited_until`` is only updated when ``parse_retry_after``
        yields a value for ``response``. ``rate_limit_start_time`` is set on
        the first call of a rate-limited period and left untouched by later
        calls, so it keeps tracking how long the period has lasted.
        """
        retry_after = parse_retry_after(response) if response is not None else None
        if retry_after is not None:
            self.rate_limited_until = time.time() + retry_after
        if self.rate_limit_start_time is None:
            self.rate_limit_start_time = time.time()

    def clear_rate_limit_state(self):
        """Clear rate-limit state after successful request or duration exceeded."""
        self.rate_limited_until = None
        self.rate_limit_start_time = None

    def upload(self):
        """Upload the next batch of items, return whether successful.

        Returns:
            True when the batch was uploaded; False when there was nothing to
            upload, the batch was dropped or re-queued, or the upload failed.
        """
        success = False
        batch = self.next()
        if not batch:
            return False

        # Check rate-limit state before attempting upload
        if self.rate_limited_until is not None:
            now = time.time()

            # Check if maxRateLimitDuration has been exceeded
            if (self.rate_limit_start_time is not None and
                    now - self.rate_limit_start_time > self.max_rate_limit_duration):
                self.log.error(
                    'Rate limit duration exceeded (%ds). Clearing rate-limit state and dropping batch.',
                    self.max_rate_limit_duration
                )
                self.clear_rate_limit_state()
                # Drop the batch by marking items as done
                if self.on_error:
                    self.on_error(
                        Exception('Rate limit duration exceeded, batch dropped'),
                        batch
                    )
                for _ in batch:
                    self.queue.task_done()
                return False

            # Still rate-limited; wait until the rate limit expires
            wait_time = self.rate_limited_until - now
            if wait_time > 0:
                self.log.debug(
                    'Rate-limited. Waiting %.2fs before next upload attempt.',
                    wait_time
                )
                time.sleep(wait_time)

        try:
            self.request(batch)
            # Success — clear rate-limit state
            self.clear_rate_limit_state()
            success = True
        except APIError as e:
            if e.status == 429 and self.rate_limited_until is not None:
                # 429: rate-limit state already set by request(). Re-queue batch.
                self.log.debug('429 received. Re-queuing batch and halting upload iteration.')
                lost = 0
                for item in batch:
                    try:
                        self.queue.put(item, block=False)
                    except Exception:
                        # Best effort: typically the queue is full. Keep
                        # going, but count the loss so it can be reported.
                        lost += 1
                if lost:
                    self.log.error(
                        '%d item(s) could not be re-queued after 429 and were dropped.',
                        lost
                    )
                success = False
            else:
                self.log.error('error uploading: %s', e)
                success = False
                if self.on_error:
                    self.on_error(e, batch)
        except Exception as e:
            self.log.error('error uploading: %s', e)
            success = False
            if self.on_error:
                self.on_error(e, batch)
        finally:
            # mark items as acknowledged from queue; re-queued items were
            # counted again by put(), so the accounting stays balanced
            for _ in batch:
                self.queue.task_done()
        return success

    def next(self):
        """Return the next batch of items to upload.

        Collects items until ``upload_size`` items are gathered,
        ``upload_interval`` seconds have elapsed, the queue stays empty for
        the remaining interval, or the accumulated serialized size reaches
        ``BATCH_SIZE_LIMIT``.

        Items that cannot be serialized or that exceed ``MAX_MSG_SIZE`` are
        dropped. They are acknowledged with ``task_done()`` right away,
        because they never reach ``upload()`` and would otherwise leave
        ``queue.join()`` blocked forever.
        """
        queue = self.queue
        items = []

        start_time = time.monotonic()
        total_size = 0

        while len(items) < self.upload_size:
            elapsed = time.monotonic() - start_time
            if elapsed >= self.upload_interval:
                break

            try:
                item = queue.get(
                    block=True, timeout=self.upload_interval - elapsed)
            except Empty:
                break
            except Exception as e:
                # Nothing was taken off the queue, so nothing to acknowledge.
                self.log.exception('Exception: %s', e)
                continue

            try:
                item_size = len(json.dumps(
                    item, cls=DatetimeSerializer).encode())
            except Exception as e:
                self.log.exception('Exception: %s', e)
                # The item is discarded; acknowledge it so join() can finish.
                queue.task_done()
                continue

            if item_size > MAX_MSG_SIZE:
                self.log.error(
                    'Item exceeds 32kb limit, dropping. (%s)', str(item))
                # The item is discarded; acknowledge it so join() can finish.
                queue.task_done()
                continue

            items.append(item)
            total_size += item_size
            if total_size >= BATCH_SIZE_LIMIT:
                self.log.debug(
                    'hit batch size limit (size: %d)', total_size)
                break

        return items

    def request(self, batch):
        """Attempt to upload the batch and retry before raising an error.

        Returns:
            The value returned by ``post`` on the first successful attempt.

        Raises:
            FatalError: Re-raised immediately, never retried.
            APIError: On a 429 with a valid Retry-After (after recording the
                rate-limit state), on a non-retryable status, or once the
                retry count or total backoff duration is exhausted.
            Exception: Any other error from ``post`` once the retry count or
                total backoff duration is exhausted.
        """

        def is_retryable_status(status):
            """
            Determine if a status code is retryable.
            Retryable 4xx: 408, 410, 429, 460
            Non-retryable 4xx: 400, 401, 403, 404, 413, 422, and all other 4xx
            Retryable 5xx: All except 501, 505
              - 511 is only retryable when OauthManager is configured
            Non-retryable 5xx: 501, 505
            """
            if 400 <= status < 500:
                return status in (408, 410, 429, 460)
            elif 500 <= status < 600:
                if status in (501, 505):
                    return False
                if status == 511:
                    return self.oauth_manager is not None
                return True
            return False

        def calculate_backoff_delay(attempt):
            """
            Calculate exponential backoff delay with jitter.
            First retry is immediate, then 0.5s, 1s, 2s, 4s, etc.
            """
            if attempt == 1:
                return 0  # First retry is immediate
            base_delay = 0.5 * (2 ** (attempt - 2))
            jitter = random.uniform(0, 0.1 * base_delay)
            return min(base_delay + jitter, 60)  # Cap at 60 seconds

        total_attempts = 0
        backoff_attempts = 0
        first_failure_time = None

        def next_backoff_delay(error):
            """
            Charge one retry against the per-batch budget.
            Returns the delay to sleep before the next attempt, or None when
            the total backoff duration or the retry count is exhausted (the
            reason is logged here; the caller re-raises).
            """
            nonlocal backoff_attempts, first_failure_time

            if first_failure_time is None:
                first_failure_time = time.time()
            if time.time() - first_failure_time > self.max_total_backoff_duration:
                self.log.error(
                    f"Max total backoff duration ({self.max_total_backoff_duration}s) exceeded "
                    f"after {total_attempts} attempts. Final error: {error}"
                )
                return None

            # Count this against backoff attempts
            backoff_attempts += 1
            if backoff_attempts >= self.retries + 1:
                self.log.error(
                    f"All {self.retries} retries exhausted after {total_attempts} total attempts. Final error: {error}"
                )
                return None

            # Exponential backoff delay with jitter
            return calculate_backoff_delay(backoff_attempts)

        while True:
            total_attempts += 1

            try:
                # Make the request with current retry count
                response = post(
                    self.write_key,
                    self.host,
                    gzip=self.gzip,
                    timeout=self.timeout,
                    batch=batch,
                    proxies=self.proxies,
                    oauth_manager=self.oauth_manager,
                    retry_count=total_attempts - 1
                )
                # Success
                return response

            except FatalError as e:
                # Non-retryable error
                self.log.error(f"Fatal error after {total_attempts} attempts: {e}")
                raise

            except APIError as e:
                # 429 with valid Retry-After: set rate-limit state and raise
                # to caller (pipeline blocking). Without Retry-After, fall
                # through to counted backoff like any other retryable error.
                if e.status == 429:
                    retry_after = parse_retry_after(e.response) if e.response is not None else None
                    if retry_after is not None:
                        self.set_rate_limit_state(e.response)
                        raise

                # Check if status is retryable
                if not is_retryable_status(e.status):
                    self.log.error(
                        f"Non-retryable error {e.status} after {total_attempts} attempts: {e}"
                    )
                    raise

                # Transient error -- per-batch backoff
                delay = next_backoff_delay(e)
                if delay is None:
                    raise

                self.log.debug(
                    f"Retry attempt {backoff_attempts}/{self.retries} (total attempts: {total_attempts}) "
                    f"after {delay:.2f}s for status {e.status}"
                )
                time.sleep(delay)

            except Exception as e:
                # Network errors or other exceptions - retry with backoff
                delay = next_backoff_delay(e)
                if delay is None:
                    raise

                self.log.debug(
                    f"Network error retry {backoff_attempts}/{self.retries} (total attempts: {total_attempts}) "
                    f"after {delay:.2f}s: {e}"
                )
                time.sleep(delay)