Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor!: change Session cookies from dict to SessionCookies with CookieJar #984

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 7 additions & 10 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

from pydantic import ValidationError
from typing_extensions import NotRequired, TypedDict, TypeVar
from yarl import URL

from crawlee import EnqueueStrategy, RequestTransformAction
from crawlee._request import Request, RequestOptions
Expand Down Expand Up @@ -202,7 +201,8 @@ async def _navigate(
"""
async with context.page:
if context.session:
await self._set_cookies(context.page, context.request.url, context.session.cookies)
cookies = context.session.cookies.get_cookies_as_browser_format()
await self._set_cookies(context.page, cookies)

if context.request.headers:
await context.page.set_extra_http_headers(context.request.headers.model_dump())
Expand All @@ -217,7 +217,7 @@ async def _navigate(

if context.session:
cookies = await self._get_cookies(context.page)
context.session.cookies.update(cookies)
context.session.cookies.set_cookies(cookies)

async def enqueue_links(
*,
Expand Down Expand Up @@ -329,17 +329,14 @@ def pre_navigation_hook(self, hook: Callable[[PlaywrightPreNavCrawlingContext],
"""
self._pre_navigation_hooks.append(hook)

async def _get_cookies(self, page: Page) -> dict[str, str]:
async def _get_cookies(self, page: Page) -> list[dict[str, Any]]:
"""Get the cookies from the page."""
cookies = await page.context.cookies()
return {cookie['name']: cookie['value'] for cookie in cookies if cookie.get('name') and cookie.get('value')}
return [{**cookie} for cookie in cookies]

async def _set_cookies(self, page: Page, url: str, cookies: dict[str, str]) -> None:
async def _set_cookies(self, page: Page, cookies: list[dict[str, Any]]) -> None:
"""Set the cookies to the page."""
parsed_url = URL(url)
await page.context.add_cookies(
[{'name': name, 'value': value, 'domain': parsed_url.host, 'path': '/'} for name, value in cookies.items()]
)
await page.context.add_cookies(cookies) # type: ignore[arg-type]


class _PlaywrightCrawlerAdditionalOptions(TypedDict):
Expand Down
16 changes: 9 additions & 7 deletions src/crawlee/http_clients/_curl_impersonate.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

if TYPE_CHECKING:
from collections.abc import Iterable
from http.cookiejar import Cookie

from curl_cffi import Curl
from curl_cffi.requests import Request as CurlRequest
Expand Down Expand Up @@ -140,7 +141,7 @@ async def crawl(
method=request.method.upper(), # type: ignore[arg-type] # curl-cffi requires uppercase method
headers=request.headers,
data=request.payload,
cookies=session.cookies if session else None,
cookies=session.cookies.jar if session else None,
allow_redirects=True,
)
except CurlRequestError as exc:
Expand All @@ -159,7 +160,7 @@ async def crawl(

if self._persist_cookies_per_session and session and response.curl:
response_cookies = self._get_cookies(response.curl)
session.cookies.update(response_cookies)
session.cookies.store_cookies(response_cookies)

request.loaded_url = response.url

Expand Down Expand Up @@ -190,7 +191,7 @@ async def send_request(
method=method.upper(), # type: ignore[arg-type] # curl-cffi requires uppercase method
headers=dict(headers) if headers else None,
data=payload,
cookies=session.cookies if session else None,
cookies=session.cookies.jar if session else None,
allow_redirects=True,
)
except CurlRequestError as exc:
Expand All @@ -206,7 +207,7 @@ async def send_request(

if self._persist_cookies_per_session and session and response.curl:
response_cookies = self._get_cookies(response.curl)
session.cookies.update(response_cookies)
session.cookies.store_cookies(response_cookies)

return _CurlImpersonateResponse(response)

Expand Down Expand Up @@ -247,9 +248,10 @@ def _is_proxy_error(error: CurlRequestError) -> bool:
return False

@staticmethod
def _get_cookies(curl: Curl) -> dict[str, str]:
cookies = {}
def _get_cookies(curl: Curl) -> list[Cookie]:
cookies: list[Cookie] = []
for curl_cookie in curl.getinfo(CurlInfo.COOKIELIST): # type: ignore[union-attr]
curl_morsel = CurlMorsel.from_curl_format(curl_cookie) # type: ignore[arg-type]
cookies[curl_morsel.name] = curl_morsel.value
cookie = curl_morsel.to_cookiejar_cookie()
cookies.append(cookie)
return cookies
6 changes: 2 additions & 4 deletions src/crawlee/http_clients/_httpx.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,7 @@ async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
response.request = request

if session := cast(Session, request.extensions.get('crawlee_session')):
response_cookies = httpx.Cookies()
response_cookies.extract_cookies(response)
session.cookies.update(response_cookies)
session.cookies.extract_cookies(response)

if 'Set-Cookie' in response.headers:
del response.headers['Set-Cookie']
Expand Down Expand Up @@ -160,7 +158,7 @@ async def crawl(
method=request.method,
headers=headers,
content=request.payload,
cookies=session.cookies if session else None,
cookies=session.cookies.jar if session else None,
extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
)

Expand Down
3 changes: 2 additions & 1 deletion src/crawlee/sessions/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from ._cookies import SessionCookies
from ._session import Session
from ._session_pool import SessionPool

__all__ = ['Session', 'SessionPool']
__all__ = ['Session', 'SessionCookies', 'SessionPool']
161 changes: 161 additions & 0 deletions src/crawlee/sessions/_cookies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
from __future__ import annotations

from copy import deepcopy
from http.cookiejar import Cookie
from typing import Any, ClassVar, Literal

from httpx import Cookies
from typing_extensions import override


class SessionCookies(Cookies):
    """Cookie manager with browser-compatible serialization and deserialization.

    Extends httpx.Cookies with support for browser-specific cookie attributes,
    format conversion and cookie dictionary representations.
    """

    _ATTRIBUTE_MAPPING: ClassVar[dict[str, str]] = {'http_only': 'httpOnly', 'same_site': 'sameSite'}
    """Mapping between internal cookie attribute names and their browser-compatible counterparts."""

    @override
    def set(
        self,
        name: str,
        value: str,
        domain: str = '',
        path: str = '/',
        expires: int | None = None,
        http_only: bool = False,
        secure: bool = False,
        same_site: Literal['Lax', 'None', 'Strict'] | None = None,
    ) -> None:
        """Build a cookiejar `Cookie` carrying modern browser attributes and store it.

        Args:
            name: Cookie name.
            value: Cookie value.
            domain: Domain the cookie applies to.
            path: Path the cookie applies to.
            expires: Expiration as a Unix timestamp, or None for a session cookie.
            http_only: Whether the cookie is hidden from client-side scripts.
            secure: Whether the cookie requires a secure context.
            same_site: Value for the SameSite attribute, if any.
        """
        # `http.cookiejar.Cookie` has no first-class HttpOnly field; it lives in `rest`.
        extra_attrs: dict[str, str] = {'HttpOnly': ''} if http_only else {}

        new_cookie = Cookie(
            version=0,
            name=name,
            value=value,
            port=None,
            port_specified=False,
            domain=domain,
            domain_specified=bool(domain),
            domain_initial_dot=domain.startswith('.'),
            path=path,
            path_specified=bool(path),
            secure=secure,
            expires=expires,
            discard=True,
            comment=None,
            comment_url=None,
            rest=extra_attrs,
            rfc2109=False,
        )

        if same_site:
            new_cookie.set_nonstandard_attr('SameSite', same_site)

        self.jar.set_cookie(new_cookie)

    def _convert_cookie_to_dict(self, cookie: Cookie) -> dict[str, Any]:
        """Represent a single `Cookie` as a plain dictionary with internal attribute names.

        Args:
            cookie: Cookie object to convert.
        """
        result: dict[str, Any] = {
            'name': cookie.name,
            'value': cookie.value or '',
            'domain': cookie.domain,
            'path': cookie.path,
            'secure': cookie.secure,
            'http_only': cookie.has_nonstandard_attr('HttpOnly'),
        }

        # Optional attributes are emitted only when present on the cookie.
        if cookie.expires:
            result['expires'] = cookie.expires
        if cookie.has_nonstandard_attr('SameSite'):
            result['same_site'] = cookie.get_nonstandard_attr('SameSite')

        return result

    def _normalize_cookie_attributes(self, cookie_dict: dict[str, Any], *, reverse: bool = False) -> dict[str, Any]:
        """Rename cookie attribute keys between internal and browser conventions.

        Args:
            cookie_dict: Dictionary with cookie attributes.
            reverse: If True, converts from internal to browser format.
        """
        result = dict(cookie_dict)

        for internal_key, browser_key in self._ATTRIBUTE_MAPPING.items():
            source, target = (internal_key, browser_key) if reverse else (browser_key, internal_key)
            if source in result:
                result[target] = result.pop(source)

        return result

    def get_cookies_as_dicts(self) -> list[dict[str, Any]]:
        """Dump every stored cookie as a dictionary suitable for persistence."""
        return [self._convert_cookie_to_dict(item) for item in self.jar]

    def get_cookies_as_browser_format(self) -> list[dict[str, Any]]:
        """Dump every stored cookie using browser-style attribute names."""
        internal_dicts = self.get_cookies_as_dicts()
        return [self._normalize_cookie_attributes(item, reverse=True) for item in internal_dicts]

    @classmethod
    def from_dict_list(cls, data: list[dict[str, Any]]) -> SessionCookies:
        """Build a new `SessionCookies` instance from dictionary representations.

        Args:
            data: List of dictionaries where each dict represents cookie parameters.
        """
        instance = cls()
        instance.set_cookies(data)
        return instance

    def store_cookie(self, cookie: Cookie) -> None:
        """Insert an existing `Cookie` object into the underlying jar.

        Args:
            cookie: The Cookie object to store in the jar.
        """
        self.jar.set_cookie(cookie)

    def store_cookies(self, cookies: list[Cookie]) -> None:
        """Insert several existing `Cookie` objects into the underlying jar.

        Args:
            cookies: A list of Cookie objects to store in the jar.
        """
        for item in cookies:
            self.store_cookie(item)

    def set_cookies(self, cookie_dicts: list[dict[str, Any]]) -> None:
        """Create and store cookies described by dictionary representations.

        Args:
            cookie_dicts: List of dictionaries where each dict represents cookie parameters.
        """
        for raw_cookie in cookie_dicts:
            self.set(**self._normalize_cookie_attributes(raw_cookie))

    def __deepcopy__(self, memo: dict[int, Any] | None) -> SessionCookies:
        # Cookie jars hold an `RLock` that cannot be deep-copied, so the cookies are
        # round-tripped through their dictionary representation instead.
        return self.__class__.from_dict_list(deepcopy(self.get_cookies_as_dicts(), memo))
4 changes: 2 additions & 2 deletions src/crawlee/sessions/_models.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

from datetime import datetime, timedelta
from typing import Annotated
from typing import Annotated, Any

from pydantic import BaseModel, ConfigDict, Field

Expand All @@ -20,7 +20,7 @@ class SessionModel(BaseModel):
usage_count: Annotated[int, Field(alias='usageCount')]
max_usage_count: Annotated[int, Field(alias='maxUsageCount')]
error_score: Annotated[float, Field(alias='errorScore')]
cookies: Annotated[dict, Field(alias='cookies')]
cookies: Annotated[list[dict[str, Any]], Field(alias='cookies')]
blocked_status_codes: Annotated[list[int], Field(alias='blockedStatusCodes')]


Expand Down
17 changes: 11 additions & 6 deletions src/crawlee/sessions/_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@

from datetime import datetime, timedelta, timezone
from logging import getLogger
from typing import ClassVar, Literal, overload
from typing import TYPE_CHECKING, ClassVar, Literal, overload

from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.docs import docs_group
from crawlee.sessions._cookies import SessionCookies
from crawlee.sessions._models import SessionModel

if TYPE_CHECKING:
from http.cookiejar import CookieJar

logger = getLogger(__name__)


Expand Down Expand Up @@ -38,7 +42,7 @@ def __init__(
usage_count: int = 0,
max_usage_count: int = 50,
error_score: float = 0.0,
cookies: dict[str, str] | None = None,
cookies: SessionCookies | CookieJar | dict[str, str] | list[tuple[str, str]] | None = None,
blocked_status_codes: list | None = None,
) -> None:
"""A default constructor.
Expand All @@ -65,13 +69,14 @@ def __init__(
self._usage_count = usage_count
self._max_usage_count = max_usage_count
self._error_score = error_score
self._cookies = cookies or {}
self._cookies = SessionCookies(cookies) or SessionCookies()
self._blocked_status_codes = set(blocked_status_codes or self._DEFAULT_BLOCKED_STATUS_CODES)

@classmethod
def from_model(cls, model: SessionModel) -> Session:
"""Create a new instance from a `SessionModel`."""
return cls(**model.model_dump())
cookies = SessionCookies.from_dict_list(model.cookies)
return cls(**model.model_dump(exclude={'cookies'}), cookies=cookies)

def __repr__(self) -> str:
"""Get a string representation."""
Expand All @@ -94,7 +99,7 @@ def user_data(self) -> dict:
return self._user_data

@property
def cookies(self) -> dict[str, str]:
def cookies(self) -> SessionCookies:
"""Get the cookies."""
return self._cookies

Expand Down Expand Up @@ -151,7 +156,7 @@ def get_state(self, *, as_dict: bool = False) -> SessionModel | dict:
usage_count=self._usage_count,
max_usage_count=self._max_usage_count,
error_score=self._error_score,
cookies=self._cookies,
cookies=self._cookies.get_cookies_as_dicts(),
blocked_status_codes=self._blocked_status_codes,
)
if as_dict:
Expand Down
Loading
Loading