-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Adds `ZipfileDecoder` component (#169)
- Loading branch information
Showing 6 changed files with 191 additions and 5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
59 changes: 59 additions & 0 deletions
59
airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
# | ||
# Copyright (c) 2024 Airbyte, Inc., all rights reserved. | ||
# | ||
|
||
import logging | ||
import zipfile | ||
from dataclasses import dataclass | ||
from io import BytesIO | ||
from typing import Any, Generator, MutableMapping | ||
|
||
import orjson | ||
import requests | ||
|
||
from airbyte_cdk.models import FailureType | ||
from airbyte_cdk.sources.declarative.decoders import Decoder | ||
from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import ( | ||
Parser, | ||
) | ||
from airbyte_cdk.utils import AirbyteTracedException | ||
|
||
logger = logging.getLogger("airbyte") | ||
|
||
|
||
@dataclass
class ZipfileDecoder(Decoder):
    """Decoder that unpacks a zip archive from a response body.

    Every file stored in the archive is read into memory and handed to
    ``parser``; the records produced for each file are yielded in archive order.
    """

    # Parser applied to the decompressed bytes of each file in the archive.
    parser: Parser

    def is_stream_response(self) -> bool:
        """Return ``False``; ``decode`` reads the body through ``response.content``."""
        return False

    def decode(
        self, response: requests.Response
    ) -> Generator[MutableMapping[str, Any], None, None]:
        """Yield the records parsed from every file inside the zipped response body.

        Args:
            response: HTTP response whose ``content`` is expected to be a zip archive.

        Yields:
            Whatever ``self.parser.parse`` yields for each archived file.

        Raises:
            AirbyteTracedException: with ``FailureType.system_error`` when the body
                is not a valid zip archive, or when the parser fails on a file.
        """
        try:
            with zipfile.ZipFile(BytesIO(response.content)) as zip_file:
                for member in zip_file.infolist():
                    # Directory entries carry no payload; passing their empty
                    # content to the parser would only produce a parse failure.
                    if member.is_dir():
                        continue
                    file_name = member.filename
                    unzipped_content = zip_file.read(member)
                    buffered_content = BytesIO(unzipped_content)
                    try:
                        yield from self.parser.parse(buffered_content)
                    except Exception as e:
                        logger.error(
                            f"Failed to parse file: {file_name} from zip file: {response.request.url} with exception {e}."
                        )
                        raise AirbyteTracedException(
                            message=f"Failed to parse file: {file_name} from zip file.",
                            internal_message=f"Failed to parse file: {file_name} from zip file: {response.request.url}.",
                            failure_type=FailureType.system_error,
                        ) from e
        except zipfile.BadZipFile as e:
            logger.error(
                f"Received an invalid zip file in response to URL: {response.request.url}. "
                f"The size of the response body is: {len(response.content)}"
            )
            raise AirbyteTracedException(
                message="Received an invalid zip file in response.",
                internal_message=f"Received an invalid zip file in response to URL: {response.request.url}.",
                failure_type=FailureType.system_error,
            ) from e
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
68 changes: 68 additions & 0 deletions
68
unit_tests/sources/declarative/decoders/test_zipfile_decoder.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
# | ||
# Copyright (c) 2025 Airbyte, Inc., all rights reserved. | ||
# | ||
import gzip | ||
import json | ||
import zipfile | ||
from io import BytesIO | ||
from typing import Union | ||
|
||
import pytest | ||
import requests | ||
|
||
from airbyte_cdk.sources.declarative.decoders import GzipParser, JsonParser, ZipfileDecoder | ||
|
||
|
||
def create_zip_from_dict(data: Union[bytes, str, dict, list]) -> bytes:
    """Build an in-memory zip archive holding a single ``data.json`` member.

    Args:
        data: Payload for the member. ``bytes`` and ``str`` are stored as given;
            a ``dict`` or ``list`` is serialized to UTF-8 encoded JSON first,
            because ``ZipFile.writestr`` only accepts ``str`` or ``bytes``.

    Returns:
        The raw bytes of the resulting zip archive.
    """
    if isinstance(data, (dict, list)):
        data = json.dumps(data).encode("utf-8")
    zip_buffer = BytesIO()
    with zipfile.ZipFile(zip_buffer, mode="w") as zip_file:
        zip_file.writestr("data.json", data)
    return zip_buffer.getvalue()
|
||
|
||
def create_multi_zip_from_dict(data: list) -> bytes:
    """Build an in-memory, deflate-compressed zip with one JSON file per item.

    The item at position ``i`` of ``data`` is serialized with ``json.dumps``,
    encoded as UTF-8 and stored under the name ``file_{i}.json``.

    Args:
        data: JSON-serializable items, one per archive member.

    Returns:
        The raw bytes of the resulting zip archive.
    """
    archive_bytes = BytesIO()
    archive = zipfile.ZipFile(archive_bytes, mode="w", compression=zipfile.ZIP_DEFLATED)
    with archive:
        for index, item in enumerate(data):
            member_name = f"file_{index}.json"
            archive.writestr(member_name, json.dumps(item).encode("utf-8"))
    return archive_bytes.getvalue()
|
||
|
||
@pytest.mark.parametrize(
    "json_data",
    [
        {"test": "test"},
        {"responses": [{"id": 1}, {"id": 2}]},
        [{"id": 1}, {"id": 2}],
        {},
    ],
)
def test_zipfile_decoder_with_single_file_response(requests_mock, json_data):
    """Decode a zip whose single member is gzip-compressed JSON.

    The complete decoder output is compared with the expected records, so the
    test fails when the decoder yields nothing or yields extra records.
    """
    zipfile_decoder = ZipfileDecoder(parser=GzipParser(inner_parser=JsonParser()))
    compressed_data = gzip.compress(json.dumps(json_data).encode())
    zipped_data = create_zip_from_dict(compressed_data)
    requests_mock.register_uri("GET", "https://airbyte.io/", content=zipped_data)
    response = requests.get("https://airbyte.io/")

    # A top-level JSON list is expected back element by element; any other
    # top-level value is expected back as a single record.
    expected_records = json_data if isinstance(json_data, list) else [json_data]
    assert list(zipfile_decoder.decode(response=response)) == expected_records
|
||
|
||
def test_zipfile_decoder_with_multi_file_response(requests_mock):
    """Decode a zip holding three JSON files and check one record per file, in order."""
    data_to_zip = [{"key1": "value1"}, {"key2": "value2"}, {"key3": "value3"}]
    archive_body = create_multi_zip_from_dict(data_to_zip)
    requests_mock.register_uri("GET", "https://airbyte.io/", content=archive_body)

    json_zip_decoder = ZipfileDecoder(parser=JsonParser())
    records = list(json_zip_decoder.decode(requests.get("https://airbyte.io/")))

    assert len(records) == 3
    for record, expected in zip(records, data_to_zip):
        assert record == expected