# analytics-python/segment/analytics/consumer.py at e4f34d0d377bb608b42537f31aeb061f6ec5d5b5 · segmentio/analytics-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
import logging
import time
import random
from threading import Thread
import json

from segment.analytics.request import post, APIError, DatetimeSerializer, parse_retry_after

from queue import Empty

# Largest serialized size, in bytes, of a single queued message (32 KiB).
MAX_MSG_SIZE = 32 * 1024

# Our servers only accept batches smaller than 500KB. The limit here is set
# slightly lower to leave room for extra data that is added later, e.g.
# "sentAt".
BATCH_SIZE_LIMIT = 475_000


class FatalError(Exception):
    """Error that must not be retried.

    The original text is kept on ``message``; ``str()`` renders it with a
    ``[Segment]`` prefix.
    """

    def __init__(self, message):
        # Forward to Exception so that ``args`` is populated (needed for a
        # meaningful repr and for pickling/copying the exception).
        super().__init__(message)
        self.message = message

    def __str__(self):
        # The previous template ended in a stray ")" that leaked into every
        # rendered message.
        return "[Segment] {0}".format(self.message)


class Consumer(Thread):
    """Consumes the messages from the client's queue.

    The thread repeatedly collects a batch of queued items (``next``),
    uploads it (``request``) and acknowledges the items on the queue
    (``upload``) until ``pause`` is called.
    """
    log = logging.getLogger('segment')

    def __init__(self, queue, write_key, upload_size=100, host=None,
                 on_error=None, upload_interval=0.5, gzip=False, retries=1000,
                 timeout=15, proxies=None, oauth_manager=None):
        """Create a consumer thread.

        Args:
            queue: Queue the items are read from; must provide ``get`` and
                ``task_done``.
            write_key: Passed to ``post`` as its first argument.
            upload_size: Maximum number of items per batch.
            host: Passed to ``post`` as its second argument.
            on_error: Optional callable invoked as ``on_error(exc, batch)``
                when uploading a batch fails.
            upload_interval: Maximum time, in seconds, spent collecting one
                batch.
            gzip: Forwarded to ``post``.
            retries: Number of backoff retries after the first failed attempt.
            timeout: Forwarded to ``post``.
            proxies: Forwarded to ``post``.
            oauth_manager: Forwarded to ``post``.
        """
        super().__init__()
        # Make consumer a daemon thread so that it doesn't block program exit
        self.daemon = True
        self.upload_size = upload_size
        self.upload_interval = upload_interval
        self.write_key = write_key
        self.host = host
        self.on_error = on_error
        self.queue = queue
        self.gzip = gzip
        # It's important to set running in the constructor: if we are asked to
        # pause immediately after construction, we might set running to True in
        # run() *after* we set it to False in pause... and keep running
        # forever.
        self.running = True
        self.retries = retries
        self.timeout = timeout
        self.proxies = proxies
        self.oauth_manager = oauth_manager

    def run(self):
        """Runs the consumer."""
        self.log.debug('consumer is running...')
        while self.running:
            self.upload()

        self.log.debug('consumer exited.')

    def pause(self):
        """Pause the consumer."""
        self.running = False

    def upload(self):
        """Upload the next batch of items, return whether successful.

        Returns ``False`` without making a request when no items were
        collected. Every item of a non-empty batch is acknowledged on the
        queue whether or not the upload succeeded.
        """
        batch = self.next()
        if not batch:
            return False

        success = False
        try:
            self.request(batch)
            success = True
        except Exception as e:
            self.log.error('error uploading: %s', e)
            if self.on_error:
                self.on_error(e, batch)
        finally:
            # mark items as acknowledged from queue
            for _ in batch:
                self.queue.task_done()
        return success

    def next(self):
        """Return the next batch of items to upload.

        Collects items until ``upload_size`` items were gathered, the
        accumulated serialized size reaches ``BATCH_SIZE_LIMIT``,
        ``upload_interval`` seconds have elapsed, or the queue stays empty
        for the remaining time. Items that cannot be serialized or that
        exceed ``MAX_MSG_SIZE`` are dropped.
        """
        queue = self.queue
        items = []

        start_time = time.monotonic()
        total_size = 0

        while len(items) < self.upload_size:
            elapsed = time.monotonic() - start_time
            if elapsed >= self.upload_interval:
                break

            try:
                item = queue.get(
                    block=True, timeout=self.upload_interval - elapsed)
            except Empty:
                break
            except Exception as e:
                # Nothing was taken off the queue, so there is nothing to
                # acknowledge; try again until the interval runs out.
                self.log.exception('Exception: %s', e)
                continue

            try:
                item_size = len(json.dumps(
                    item, cls=DatetimeSerializer).encode())
            except Exception as e:
                self.log.exception('Exception: %s', e)
                # The item is dropped and will never be part of a batch, so
                # acknowledge it here; otherwise queue.join() never returns.
                queue.task_done()
                continue

            if item_size > MAX_MSG_SIZE:
                self.log.error(
                    'Item exceeds 32kb limit, dropping. (%s)', item)
                # Same as above: a dropped item must still be acknowledged.
                queue.task_done()
                continue

            items.append(item)
            total_size += item_size
            if total_size >= BATCH_SIZE_LIMIT:
                self.log.debug(
                    'hit batch size limit (size: %d)', total_size)
                break

        return items

    @staticmethod
    def _is_retryable_status(status):
        """
        Determine if a status code is retryable.
        Retryable 4xx: 408, 410, 429, 460
        Non-retryable 4xx: 400, 401, 403, 404, 413, 422, and all other 4xx
        Retryable 5xx: All except 501, 505
        Non-retryable 5xx: 501, 505
        """
        if 400 <= status < 500:
            return status in (408, 410, 429, 460)
        if 500 <= status < 600:
            return status not in (501, 505)
        return False

    @staticmethod
    def _should_use_retry_after(status):
        """Check if status code should respect Retry-After header"""
        return status in (408, 429, 503)

    @staticmethod
    def _backoff_delay(backoff_attempts):
        """Return the jittered exponential backoff delay, capped at 60s.

        ``backoff_attempts`` is the 1-based count of failed attempts.
        """
        # 0.5 * 2**7 == 64 already exceeds the 60 second cap, so larger
        # exponents cannot change the result. Capping the exponent keeps the
        # int -> float conversion from overflowing for very large counts.
        exponent = min(backoff_attempts - 1, 7)
        base_delay = 0.5 * (2 ** exponent)
        jitter = random.uniform(0, 0.1 * base_delay)
        return min(base_delay + jitter, 60)  # Cap at 60 seconds

    def request(self, batch):
        """Attempt to upload the batch and retry before raising an error.

        Returns whatever ``post`` returns on success.

        Raises:
            FatalError: re-raised immediately, never retried.
            APIError: when the status is not retryable or the backoff
                retries are exhausted.
            Exception: any other error from ``post`` once the backoff
                retries are exhausted.
        """
        total_attempts = 0
        backoff_attempts = 0
        max_backoff_attempts = self.retries + 1

        while True:
            try:
                # Make the request with current retry count
                return post(
                    self.write_key,
                    self.host,
                    gzip=self.gzip,
                    timeout=self.timeout,
                    batch=batch,
                    proxies=self.proxies,
                    oauth_manager=self.oauth_manager,
                    retry_count=total_attempts
                )

            except FatalError as e:
                # Non-retryable error
                self.log.error(f"Fatal error after {total_attempts} attempts: {e}")
                raise

            except APIError as e:
                total_attempts += 1

                # Check if we should use Retry-After header.
                # NOTE(review): e.response is presumably an HTTP response
                # object (confirm in segment.analytics.request). Objects such
                # as requests.Response are falsy for 4xx/5xx statuses, so
                # presence is tested with "is not None", not truthiness.
                response = getattr(e, 'response', None)
                if self._should_use_retry_after(e.status) and response is not None:
                    retry_after = parse_retry_after(response)
                    if retry_after:
                        self.log.debug(
                            f"Retry-After header present: waiting {retry_after}s (attempt {total_attempts})"
                        )
                        time.sleep(retry_after)
                        continue  # Does not count against backoff budget

                # Check if status is retryable
                if not self._is_retryable_status(e.status):
                    self.log.error(
                        f"Non-retryable error {e.status} after {total_attempts} attempts: {e}"
                    )
                    raise

                # Count this against backoff attempts
                backoff_attempts += 1
                if backoff_attempts >= max_backoff_attempts:
                    self.log.error(
                        f"All {self.retries} retries exhausted after {total_attempts} total attempts. Final error: {e}"
                    )
                    raise

                delay = self._backoff_delay(backoff_attempts)
                self.log.debug(
                    f"Retry attempt {backoff_attempts}/{self.retries} (total attempts: {total_attempts}) "
                    f"after {delay:.2f}s for status {e.status}"
                )
                time.sleep(delay)

            except Exception as e:
                # Network errors or other exceptions - retry with backoff
                total_attempts += 1
                backoff_attempts += 1

                if backoff_attempts >= max_backoff_attempts:
                    self.log.error(
                        f"All {self.retries} retries exhausted after {total_attempts} total attempts. Final error: {e}"
                    )
                    raise

                delay = self._backoff_delay(backoff_attempts)
                self.log.debug(
                    f"Network error retry {backoff_attempts}/{self.retries} (total attempts: {total_attempts}) "
                    f"after {delay:.2f}s: {e}"
                )
                time.sleep(delay)