Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(low-code): Fix declarative low-code state migration in SubstreamPartitionRouter #267

Merged
merged 4 commits into from
Jan 24, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,10 @@ def _set_initial_state(self, stream_state: StreamState) -> None:
if not stream_state:
return

if self._PERPARTITION_STATE_KEY not in stream_state:
if (
self._PERPARTITION_STATE_KEY not in stream_state
and self._GLOBAL_STATE_KEY not in stream_state
):
# We assume that `stream_state` is in a global format that can be applied to all partitions.
# Example: {"global_state_format_key": "global_state_format_value"}
self._global_cursor = deepcopy(stream_state)
Expand All @@ -273,7 +276,7 @@ def _set_initial_state(self, stream_state: StreamState) -> None:
else:
self._lookback_window = int(stream_state.get("lookback_window", 0))

for state in stream_state[self._PERPARTITION_STATE_KEY]:
for state in stream_state.get(self._PERPARTITION_STATE_KEY, []):
self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
self._create_cursor(state["cursor"])
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -295,28 +295,58 @@ def set_initial_state(self, stream_state: StreamState) -> None:
return

if not parent_state and incremental_dependency:
# Attempt to retrieve child state
substream_state_values = list(stream_state.values())
substream_state = substream_state_values[0] if substream_state_values else {}
# Filter out per partition state. Because we pass the state to the parent stream in the format {cursor_field: substream_state}
if isinstance(substream_state, (list, dict)):
substream_state = {}

parent_state = {}

# Copy child state to parent streams with incremental dependencies
if substream_state:
for parent_config in self.parent_stream_configs:
if parent_config.incremental_dependency:
parent_state[parent_config.stream.name] = {
parent_config.stream.cursor_field: substream_state
}
# Migrate child state to parent state format
parent_state = self._migrate_child_state_to_parent_state(stream_state)

# Set state for each parent stream with an incremental dependency
for parent_config in self.parent_stream_configs:
if parent_config.incremental_dependency:
parent_config.stream.state = parent_state.get(parent_config.stream.name, {})

def _migrate_child_state_to_parent_state(self, stream_state: StreamState) -> StreamState:
"""
Migrate the child stream state to the parent stream's state format.

This method converts the global or child state into a format compatible with parent
streams. The migration occurs only for parent streams with incremental dependencies.
The method filters out per-partition states and retains only the global state in the
format `{cursor_field: cursor_value}`.

Args:
stream_state (StreamState): The state to migrate. Expected formats include:
- {"updated_at": "2023-05-27T00:00:00Z"}
- {"states": [...] } (ignored during migration)

Returns:
StreamState: A migrated state for parent streams in the format:
{
"parent_stream_name": {"parent_stream_cursor": "2023-05-27T00:00:00Z"}
}

Example:
Input: {"updated_at": "2023-05-27T00:00:00Z"}
Output: {
"parent_stream_name": {"parent_stream_cursor": "2023-05-27T00:00:00Z"}
}
"""
substream_state_values = list(stream_state.values())
substream_state = substream_state_values[0] if substream_state_values else {}

# Ignore per-partition states or invalid formats
if isinstance(substream_state, (list, dict)) or len(substream_state_values) != 1:
return {}

# Copy child state to parent streams with incremental dependencies
parent_state = {}
if substream_state:
for parent_config in self.parent_stream_configs:
if parent_config.incremental_dependency:
parent_state[parent_config.stream.name] = {
parent_config.stream.cursor_field: substream_state
}

return parent_state

def get_stream_state(self) -> Optional[Mapping[str, StreamState]]:
"""
Get the state of the parent streams.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -464,13 +464,23 @@ def test_substream_partition_router_invalid_parent_record_type():
},
{"parent_stream_cursor": "2023-05-27T00:00:00Z"},
),
# Case 6: Declarative global cursor state, no migration expected
(
{
"looback_window": 1,
"use_global_cursor": True,
"state": {"updated": "2023-05-27T00:00:00Z"},
},
{},
),
],
ids=[
"empty_initial_state",
"initial_state_no_parent_legacy_state",
"initial_state_no_parent_global_state",
"initial_state_no_parent_per_partition_state",
"initial_state_with_parent_state",
"initial_state_no_parent_global_state_declarative",
],
)
def test_set_initial_state(initial_state, expected_parent_state):
Expand Down
Loading