Skip to content

Commit

Permalink
fix(low-code): Fix declarative low-code state migration in SubstreamP…
Browse files Browse the repository at this point in the history
…artitionRouter (#267)
  • Loading branch information
tolik0 authored Jan 24, 2025
1 parent b1824c6 commit 0a12a58
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 32 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def close_partition(self, partition: Partition) -> None:
< cursor.state[self.cursor_field.cursor_field_key]
):
self._new_global_cursor = copy.deepcopy(cursor.state)
self._emit_state_message()
self._emit_state_message()

def ensure_at_least_one_state_emitted(self) -> None:
"""
Expand Down Expand Up @@ -192,7 +192,8 @@ def _generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[St
self._global_cursor,
self._lookback_window if self._global_cursor else 0,
)
self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
with self._lock:
self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
self._semaphore_per_partition[self._to_partition_key(partition.partition)] = (
threading.Semaphore(0)
)
Expand All @@ -210,16 +211,38 @@ def _generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[St

def _ensure_partition_limit(self) -> None:
"""
Ensure the maximum number of partitions is not exceeded. If so, the oldest added partition will be dropped.
Ensure the maximum number of partitions does not exceed the predefined limit.
Steps:
1. Attempt to remove partitions that are marked as finished in `_finished_partitions`.
These partitions are considered processed and safe to delete.
2. If the limit is still exceeded and no finished partitions are available for removal,
remove the oldest partition unconditionally. We expect failed partitions to be removed.
Logging:
- Logs a warning each time a partition is removed, indicating whether it was finished
or removed due to being the oldest.
"""
while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
self._over_limit += 1
oldest_partition = self._cursor_per_partition.popitem(last=False)[
0
] # Remove the oldest partition
logger.warning(
f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
)
with self._lock:
while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
# Try removing finished partitions first
for partition_key in list(self._cursor_per_partition.keys()):
if partition_key in self._finished_partitions:
oldest_partition = self._cursor_per_partition.pop(
partition_key
) # Remove the oldest partition
logger.warning(
f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
)
break
else:
# If no finished partitions can be removed, fall back to removing the oldest partition
oldest_partition = self._cursor_per_partition.popitem(last=False)[
1
] # Remove the oldest partition
logger.warning(
f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
)

def _set_initial_state(self, stream_state: StreamState) -> None:
"""
Expand Down Expand Up @@ -264,7 +287,10 @@ def _set_initial_state(self, stream_state: StreamState) -> None:
if not stream_state:
return

if self._PERPARTITION_STATE_KEY not in stream_state:
if (
self._PERPARTITION_STATE_KEY not in stream_state
and self._GLOBAL_STATE_KEY not in stream_state
):
# We assume that `stream_state` is in a global format that can be applied to all partitions.
# Example: {"global_state_format_key": "global_state_format_value"}
self._global_cursor = deepcopy(stream_state)
Expand All @@ -273,7 +299,7 @@ def _set_initial_state(self, stream_state: StreamState) -> None:
else:
self._lookback_window = int(stream_state.get("lookback_window", 0))

for state in stream_state[self._PERPARTITION_STATE_KEY]:
for state in stream_state.get(self._PERPARTITION_STATE_KEY, []):
self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
self._create_cursor(state["cursor"])
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -295,28 +295,58 @@ def set_initial_state(self, stream_state: StreamState) -> None:
return

if not parent_state and incremental_dependency:
# Attempt to retrieve child state
substream_state_values = list(stream_state.values())
substream_state = substream_state_values[0] if substream_state_values else {}
# Filter out per partition state. Because we pass the state to the parent stream in the format {cursor_field: substream_state}
if isinstance(substream_state, (list, dict)):
substream_state = {}

parent_state = {}

# Copy child state to parent streams with incremental dependencies
if substream_state:
for parent_config in self.parent_stream_configs:
if parent_config.incremental_dependency:
parent_state[parent_config.stream.name] = {
parent_config.stream.cursor_field: substream_state
}
# Migrate child state to parent state format
parent_state = self._migrate_child_state_to_parent_state(stream_state)

# Set state for each parent stream with an incremental dependency
for parent_config in self.parent_stream_configs:
if parent_config.incremental_dependency:
parent_config.stream.state = parent_state.get(parent_config.stream.name, {})

def _migrate_child_state_to_parent_state(self, stream_state: StreamState) -> StreamState:
"""
Migrate the child stream state to the parent stream's state format.
This method converts the global or child state into a format compatible with parent
streams. The migration occurs only for parent streams with incremental dependencies.
The method filters out per-partition states and retains only the global state in the
format `{cursor_field: cursor_value}`.
Args:
stream_state (StreamState): The state to migrate. Expected formats include:
- {"updated_at": "2023-05-27T00:00:00Z"}
- {"states": [...] } (ignored during migration)
Returns:
StreamState: A migrated state for parent streams in the format:
{
"parent_stream_name": {"parent_stream_cursor": "2023-05-27T00:00:00Z"}
}
Example:
Input: {"updated_at": "2023-05-27T00:00:00Z"}
Output: {
"parent_stream_name": {"parent_stream_cursor": "2023-05-27T00:00:00Z"}
}
"""
substream_state_values = list(stream_state.values())
substream_state = substream_state_values[0] if substream_state_values else {}

# Ignore per-partition states or invalid formats
if isinstance(substream_state, (list, dict)) or len(substream_state_values) != 1:
return {}

# Copy child state to parent streams with incremental dependencies
parent_state = {}
if substream_state:
for parent_config in self.parent_stream_configs:
if parent_config.incremental_dependency:
parent_state[parent_config.stream.name] = {
parent_config.stream.cursor_field: substream_state
}

return parent_state

def get_stream_state(self) -> Optional[Mapping[str, StreamState]]:
"""
Get the state of the parent streams.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1171,7 +1171,7 @@ def test_incremental_parent_state(


@pytest.mark.parametrize(
"test_name, manifest, mock_requests, expected_records, initial_state, expected_state",
"test_name, manifest, mock_requests, expected_records, expected_state",
[
(
"test_incremental_parent_state",
Expand Down Expand Up @@ -1326,8 +1326,6 @@ def test_incremental_parent_state(
"id": 300,
},
],
# Initial state
{"created_at": PARTITION_SYNC_START_TIME},
# Expected state
{
"state": {"created_at": VOTE_100_CREATED_AT},
Expand Down Expand Up @@ -1384,6 +1382,25 @@ def test_incremental_parent_state(
),
],
)
@pytest.mark.parametrize(
"initial_state",
[
{"created_at": PARTITION_SYNC_START_TIME},
{
"state": {"created_at": PARTITION_SYNC_START_TIME},
"lookback_window": 0,
"use_global_cursor": True,
"parent_state": {
"post_comments": {
"state": {"updated_at": PARTITION_SYNC_START_TIME},
"parent_state": {"posts": {"updated_at": PARTITION_SYNC_START_TIME}},
"lookback_window": 0,
}
},
},
],
ids=["legacy_python_format", "low_code_global_format"],
)
def test_incremental_parent_state_migration(
test_name, manifest, mock_requests, expected_records, initial_state, expected_state
):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -464,13 +464,23 @@ def test_substream_partition_router_invalid_parent_record_type():
},
{"parent_stream_cursor": "2023-05-27T00:00:00Z"},
),
# Case 6: Declarative global cursor state, no migration expected
(
{
"looback_window": 1,
"use_global_cursor": True,
"state": {"updated": "2023-05-27T00:00:00Z"},
},
{},
),
],
ids=[
"empty_initial_state",
"initial_state_no_parent_legacy_state",
"initial_state_no_parent_global_state",
"initial_state_no_parent_per_partition_state",
"initial_state_with_parent_state",
"initial_state_no_parent_global_state_declarative",
],
)
def test_set_initial_state(initial_state, expected_parent_state):
Expand Down

0 comments on commit 0a12a58

Please sign in to comment.