|
| 1 | +from unittest.mock import Mock |
| 2 | + |
| 3 | +from quixstreams.models import Topic, TopicConfig |
| 4 | +from quixstreams.processing.watermarking import WatermarkManager |
| 5 | + |
| 6 | + |
| 7 | +class TestWatermarkBootstrap: |
| 8 | + """ |
| 9 | + Basic tests for watermark bootstrapping. |
| 10 | +
|
| 11 | + Note: Full testing of the progressive search algorithm requires integration |
| 12 | + tests with a real Kafka broker, as mocking the complex message polling and |
| 13 | + deserialization flow is error-prone and doesn't provide meaningful coverage. |
| 14 | + """ |
| 15 | + |
| 16 | + def test_bootstrap_watermarks_empty_topic(self, topic_manager_factory): |
| 17 | + """ |
| 18 | + Test that bootstrap handles an empty watermarks topic gracefully. |
| 19 | + """ |
| 20 | + topic_manager = topic_manager_factory() |
| 21 | + producer = Mock() |
| 22 | + wm_manager = WatermarkManager( |
| 23 | + producer=producer, topic_manager=topic_manager, interval=1.0 |
| 24 | + ) |
| 25 | + |
| 26 | + test_topic = Topic( |
| 27 | + name="topic1", |
| 28 | + value_deserializer="json", |
| 29 | + create_config=TopicConfig(num_partitions=1, replication_factor=1), |
| 30 | + ) |
| 31 | + test_topic.broker_config = test_topic.create_config |
| 32 | + wm_manager.set_topics([test_topic]) |
| 33 | + |
| 34 | + consumer = Mock() |
| 35 | + consumer.get_watermark_offsets.return_value = (0, 0) # Empty topic |
| 36 | + |
| 37 | + wm_manager.bootstrap_watermarks(consumer) |
| 38 | + |
| 39 | + # Watermark should remain at -1 |
| 40 | + assert wm_manager._watermarks[("topic1", 0)] == -1 |
| 41 | + |
| 42 | + # No seek should be called for empty topic |
| 43 | + consumer.seek.assert_not_called() |
| 44 | + |
| 45 | + def test_bootstrap_watermarks_no_expected_tps(self, topic_manager_factory): |
| 46 | + """ |
| 47 | + Test that bootstrap handles the case where no topic-partitions are expected. |
| 48 | + """ |
| 49 | + topic_manager = topic_manager_factory() |
| 50 | + producer = Mock() |
| 51 | + wm_manager = WatermarkManager( |
| 52 | + producer=producer, topic_manager=topic_manager, interval=1.0 |
| 53 | + ) |
| 54 | + |
| 55 | + # Don't set any topics - no expected TPs |
| 56 | + consumer = Mock() |
| 57 | + |
| 58 | + wm_manager.bootstrap_watermarks(consumer) |
| 59 | + |
| 60 | + # Should exit early without calling get_watermark_offsets |
| 61 | + consumer.get_watermark_offsets.assert_not_called() |
| 62 | + |
| 63 | + def test_bootstrap_watermarks_exponential_backoff(self, topic_manager_factory): |
| 64 | + """ |
| 65 | + Test that bootstrap uses exponential backoff when not all TPs are found. |
| 66 | +
|
| 67 | + This test verifies true exponential backoff WITHOUT re-reading: |
| 68 | + 1. Initial seek: offset 900 (1000 - 100), read 100 messages to offset 1000 |
| 69 | + 2. If not all TPs found, seek back: offset 700 (900 - 200), read 200 messages to offset 900 |
| 70 | + 3. If still not found, seek back: offset 300 (700 - 400), read 400 messages to offset 700 |
| 71 | + 4. Continues until all TPs found or offset 0 is reached |
| 72 | +
|
| 73 | + Key: Each iteration seeks BACK from the previous position, not from high_offset. |
| 74 | + This avoids re-reading the same messages multiple times. |
| 75 | + """ |
| 76 | + topic_manager = topic_manager_factory() |
| 77 | + producer = Mock() |
| 78 | + wm_manager = WatermarkManager( |
| 79 | + producer=producer, topic_manager=topic_manager, interval=1.0 |
| 80 | + ) |
| 81 | + |
| 82 | + # Set up 3 topic-partitions |
| 83 | + test_topic = Topic( |
| 84 | + name="topic1", |
| 85 | + value_deserializer="json", |
| 86 | + create_config=TopicConfig(num_partitions=3, replication_factor=1), |
| 87 | + ) |
| 88 | + test_topic.broker_config = test_topic.create_config |
| 89 | + wm_manager.set_topics([test_topic]) |
| 90 | + |
| 91 | + consumer = Mock() |
| 92 | + consumer.get_watermark_offsets.return_value = (0, 1000) |
| 93 | + |
| 94 | + # Track which iteration we're in |
| 95 | + poll_count = [0] |
| 96 | + found_partitions = set() |
| 97 | + |
| 98 | + def mock_poll(timeout): |
| 99 | + poll_count[0] += 1 |
| 100 | + |
| 101 | + # First iteration: seeking to 900, reading toward 1000 |
| 102 | + # Return messages for partitions 0 and 1 only |
| 103 | + if consumer.seek.call_count == 1: |
| 104 | + if poll_count[0] <= 100: |
| 105 | + # Return watermarks for partitions 0 and 1 |
| 106 | + partition = 0 if poll_count[0] % 2 == 0 else 1 |
| 107 | + if partition not in found_partitions: |
| 108 | + found_partitions.add(partition) |
| 109 | + return create_mock_watermark_message( |
| 110 | + wm_manager.watermarks_topic, |
| 111 | + "topic1", |
| 112 | + partition, |
| 113 | + 1000 + poll_count[0], |
| 114 | + ) |
| 115 | + else: |
| 116 | + return None # Timeout to trigger next iteration |
| 117 | + |
| 118 | + # Second iteration: seeking to 700 (900 - 200), reading toward 900 |
| 119 | + # Now include partition 2 |
| 120 | + elif consumer.seek.call_count == 2: |
| 121 | + # Return watermark for partition 2 |
| 122 | + if poll_count[0] % 3 == 0: |
| 123 | + found_partitions.add(2) |
| 124 | + return create_mock_watermark_message( |
| 125 | + wm_manager.watermarks_topic, "topic1", 2, 3000 |
| 126 | + ) |
| 127 | + return None # Return None to speed up the loop |
| 128 | + |
| 129 | + return None |
| 130 | + |
| 131 | + consumer.poll.side_effect = mock_poll |
| 132 | + |
| 133 | + # Run bootstrap |
| 134 | + wm_manager.bootstrap_watermarks(consumer) |
| 135 | + |
| 136 | + # Verify exponential backoff happened |
| 137 | + assert consumer.seek.call_count >= 2, "Should have seeked at least twice" |
| 138 | + |
| 139 | + # Verify seek offsets show TRUE exponential backoff (seeking backwards, not from high_offset) |
| 140 | + seek_calls = consumer.seek.call_args_list |
| 141 | + first_seek = seek_calls[0][0][0] |
| 142 | + second_seek = seek_calls[1][0][0] |
| 143 | + |
| 144 | + # First seek: offset = 1000 - 100 = 900 |
| 145 | + assert ( |
| 146 | + first_seek.offset == 900 |
| 147 | + ), f"First seek should be at 900, got {first_seek.offset}" |
| 148 | + |
| 149 | + # Second seek: offset = 900 - 200 = 700 (seeking back from previous position) |
| 150 | + assert ( |
| 151 | + second_seek.offset == 700 |
| 152 | + ), f"Second seek should be at 700 (900 - 200), got {second_seek.offset}" |
| 153 | + |
| 154 | + # All watermarks should be set |
| 155 | + assert wm_manager._watermarks[("topic1", 0)] > -1 |
| 156 | + assert wm_manager._watermarks[("topic1", 1)] > -1 |
| 157 | + assert wm_manager._watermarks[("topic1", 2)] > -1 |
| 158 | + |
| 159 | + |
| 160 | +def create_mock_watermark_message(watermarks_topic, topic, partition, timestamp): |
| 161 | + """ |
| 162 | + Helper to create a properly mocked Kafka message. |
| 163 | + """ |
| 164 | + msg = Mock() |
| 165 | + msg.error.return_value = None |
| 166 | + msg.topic.return_value = watermarks_topic.name |
| 167 | + msg.partition.return_value = 0 |
| 168 | + msg.offset.return_value = 1 |
| 169 | + |
| 170 | + # Create a mock Row that will be returned by deserialize |
| 171 | + mock_row = Mock() |
| 172 | + mock_row.value = { |
| 173 | + "topic": topic, |
| 174 | + "partition": partition, |
| 175 | + "timestamp": timestamp, |
| 176 | + } |
| 177 | + |
| 178 | + # Mock the deserialize method to return our row |
| 179 | + watermarks_topic.deserialize = Mock(return_value=mock_row) |
| 180 | + |
| 181 | + return msg |
0 commit comments