Merge pull request #81 from hynky1999/structure
Structure
hynky1999 authored Sep 9, 2023
2 parents 95d12ef + 010d75c commit 8e03b10
Showing 27 changed files with 367 additions and 159 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test_and_types.yml
@@ -22,7 +22,7 @@ jobs:
cache: "pip"

- name: Install dependencies
run: pip install -r requirements.txt # Replace with your dependencies installation command
run: pip install -r requirements.test.txt # Replace with your dependencies installation command

- name: Run tests
run: python -m unittest discover -s tests -p "*_tests.py" # Replace with your test command
@@ -40,7 +40,7 @@ jobs:
cache: "pip"

- name: Install dependencies
run: pip install -r requirements.txt && pip install -r requirements-test.txt
run: pip install -r requirements.dev.txt

- name: Lint with pyright
run: pyright
10 changes: 9 additions & 1 deletion .pre-commit-config.yaml
@@ -8,8 +8,16 @@ repos:
- id: end-of-file-fixer
- id: trailing-whitespace
- id: check-added-large-files

- repo: https://github.com/myint/autoflake
rev: v1.4
hooks:
- id: autoflake
args: [--remove-all-unused-imports, --ignore-init-module-imports, --remove-unused-variables]
language_version: python3.11

- repo: https://github.com/psf/black
rev: 23.3.0
hooks:
- id: black
language_version: python3.10
language_version: python3.11
31 changes: 10 additions & 21 deletions .vscode/launch.json
@@ -5,33 +5,22 @@
"name": "Download",
"type": "python",
"request": "launch",
"program": "${workspaceRoot}/download_article.py",
"module": "cmoncrawl.integrations.commands",
"console": "integratedTerminal",
"args": ["--limit=100", "--to=2018-12-31", "idnes.cz", "Processor/denik2019"]
"args": ["download", "idnes.cz", "out", "html"]
},
{
"name": "Process",
"name": "Extract",
"type": "python",
"request": "launch",
"program": "${workspaceRoot}/process_article.py",
"module": "cmoncrawl.integrations.commands",
"console": "integratedTerminal",
"args": ["unit_tests/sites/seznamzpravyCZ/test_articles/article1.html", "./processed_articles"]
},
{
"name": "Processor",
"type": "python",
"request": "launch",
"program": "${workspaceRoot}/process_article.py",
"console": "integratedTerminal",
"args": ["unit_tests/sites/seznamzpravyCZ/test_articles/article1.html", "./processed_articles"]
},
{
"name": "FUN",
"type": "python",
"request": "launch",
"program": "${workspaceRoot}/Artemis/adjust_config.py",
"console": "integratedTerminal",
"args": ["unit_tests/sites/seznamzpravyCZ/test_articles/article1.html", "./processed_articles"]
"args": ["extract",
"examples/extractor_tutorial/config.json",
"out_extr",
"out_html/1_file.jsonl",
"record"],
"justMyCode": false
},

]
7 changes: 5 additions & 2 deletions .vscode/settings.json
@@ -1,6 +1,6 @@
{
"html.format.wrapLineLength": 80,
"python.formatting.provider": "black",
"python.formatting.provider": "none",
"python.linting.enabled": true,
"python.testing.unittestArgs": [
"-v",
@@ -13,5 +13,8 @@
"python.testing.unittestEnabled": true,
"python.analysis.typeCheckingMode": "strict",
"python.linting.mypyPath": "/usr/bin/mypy",
"cSpell.enabled": false
"cSpell.enabled": false,
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter"
}
}
34 changes: 28 additions & 6 deletions README.md
@@ -4,6 +4,18 @@
Unlike other Common Crawl extractors, this project allows you to create custom extractors with a high level of modularity.
Unlike retrieving records from the Common Crawl index with Amazon's Athena, this solution is completely free of charge :)

### Installation
#### From PyPI
```bash
$ pip install cmoncrawl
```
#### From source
```bash
$ git clone https://github.com/hynky1999/CmonCrawl
$ cd CmonCrawl
$ pip install -r requirements.txt
$ pip install -e .
```

### Usage

@@ -14,15 +26,18 @@ To create them you need example html files you want to extract.
You can use the following command to get html files from the CommonCrawl dataset:

```bash
$ cmon download --match_type=domain --limit=1000 example.com html_output html
$ cmon download --match_type=domain --limit=100 example.com html_output html
```
This will download the first 100 html files from example.com and save them in html_output.


#### Extractor creation
Once you have the files to extract, you can create your extractor.
To do so, create a new Python file, e.g. my_extractor.py, in the extractors directory and add the following code:

```python
from bs4 import BeautifulSoup
from cmoncrawl.common.types import PipeMetadata
from cmoncrawl.processor.pipeline.extractor import BaseExtractor
class MyExtractor(BaseExtractor):
def __init__(self):
@@ -32,6 +47,12 @@
def extract_soup(self, soup: BeautifulSoup, metadata: PipeMetadata):
# here you can extract the data you want from the soup
# and return a dict with the data you want to save
body = soup.select_one("body")
if body is None:
return None
return {
"body": body.get_text()
}

# You can also override the following methods to drop the files you don't want to extract
# Return True to keep the file, False to drop it
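# A hedged sketch of such an override; the hook names (filter_raw/filter_soup)
# are an assumption and are not shown in this excerpt:
def filter_soup(self, soup: BeautifulSoup, metadata: PipeMetadata) -> bool:
    # keep only pages that actually contain an <article> element
    return soup.find("article") is not None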
@@ -61,8 +82,8 @@ In our case the config would look like this:
# You can use since and to to choose the extractor based
# on the date of the crawl
# You can omit either of them
"since": "2009-01-01T00:00:00+00:00",
"to": "2009-01-01T00:00:00+00:00"
"since": "2009-01-01",
"to": "2025-01-01"
}]
},
# More routes here
@@ -74,7 +95,7 @@ In our case the config would look like this:
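For orientation, a complete config.json might look like the sketch below. The top-level keys `extractors_path` and `routes` are assumptions inferred from the `ExtractConfig` and `RoutesConfig` classes in cmoncrawl/common/types.py, not spelled out in this excerpt:

```json
{
    "extractors_path": "./extractors",
    "routes": [
        {
            "regexes": [".*example\\.com.*"],
            "extractors": [
                {
                    "name": "my_extractor",
                    "since": "2009-01-01",
                    "to": "2025-01-01"
                }
            ]
        }
    ]
}
```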
To test the extraction, you can use the following command:

```bash
$ cmon extract config.json extracted_output html_output/*/*.html html
$ cmon extract config.json extracted_output html_output/*.html html
```

### Crawl the sites
@@ -94,12 +115,13 @@ This will download the first 100000 records from example.com and save them in dr_output.
Once you have the records, you can use the following command to extract them:

```bash
$ cmon extract --n_proc=4 config.json extracted_output dr_output/*/*.jsonl record
$ cmon extract --n_proc=4 config.json extracted_output dr_output/*.jsonl record
```

Note that you can use the --n_proc option to specify the number of processes to use for the extraction. Multiprocessing is done at the file level, so if you have just one file it will not be used.


### Other examples
For other examples, see [examples](https://github.com/hynky1999/CmonCrawl/tree/main/examples)
### Advanced usage
The whole project was written with modularity in mind. That means that you
can adjust the framework to your needs. To learn more, see the [documentation](https://hynky1999.github.io/CmonCrawl/)
5 changes: 2 additions & 3 deletions cmoncrawl/aggregator/index_query.py
@@ -5,7 +5,6 @@
import re

from cmoncrawl.aggregator.utils import ndjson
import json
from types import TracebackType
from typing import (
Any,
@@ -155,8 +154,8 @@ async def __retrieve(
**args: Any,
):
def should_retry(retry: int, reason: str, status: int, **args: Any):
# if logger at least info than report every retry otherwise report every 10 retries
if all_purpose_logger.level <= logging.INFO or retry % 10 == 0:
# if the logger is at DEBUG level, report every retry; otherwise report every 10 retries
if all_purpose_logger.level <= logging.DEBUG or retry % 10 == 0:
all_purpose_logger.error(
f"Failed to retrieve page of {domain} from {cdx_server} with reason {status}: {reason} retry: {retry + 1}/{max_retry} add_info: {args}"
)
1 change: 0 additions & 1 deletion cmoncrawl/aggregator/utils/helpers.py
@@ -16,7 +16,6 @@ def unify_url_id(url: str):
if path_match:
path_processed = path_match.group(0)
else:
all_purpose_logger.warn(f"No path match for {url}")
path_processed = ""
path_processed = remove_trailing.sub("", path_processed)
netloc = parsed.netloc
11 changes: 11 additions & 0 deletions cmoncrawl/common/loggers.py
@@ -31,3 +31,14 @@
)
)
metadata_logger.addHandler(handler)


def setup_loggers(verbosity: int):
verbosity_cfg = {
0: logging.WARNING,
1: logging.INFO,
2: logging.DEBUG,
}
verbosity = min(verbosity, max(verbosity_cfg.keys()))
all_purpose_logger.setLevel(verbosity_cfg[verbosity])
metadata_logger.setLevel(verbosity_cfg[verbosity])
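# Usage sketch: the CLI passes its --verbosity value here (see
# cmoncrawl/integrations/commands.py below); 0 -> WARNING, 1 -> INFO,
# 2 -> DEBUG, and anything above 2 is clamped to DEBUG by the min() above.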
62 changes: 37 additions & 25 deletions cmoncrawl/common/types.py
@@ -4,14 +4,27 @@
from typing import Any, Dict, List
from urllib.parse import urlparse
from dataclasses import dataclass, field
from marshmallow import fields
from pydantic import BaseModel, Field, validator
from typing import Optional, List
from datetime import datetime

from dataclasses_json import dataclass_json, config
from pydantic import BaseModel


@dataclass_json
@dataclass
class DomainRecord:
def parse_timestamp(v: Optional[Any]) -> Optional[datetime]:
if v is None:
return None

if isinstance(v, datetime):
return v

if isinstance(v, str):
return datetime.fromisoformat(v)

raise ValueError(f"Invalid timestamp: {v}")


class DomainRecord(BaseModel):
"""
Domain record.
"""
@@ -22,9 +35,11 @@ class DomainRecord:
length: int
digest: str | None = None
encoding: str | None = None
timestamp: datetime | None = field(
metadata=config(mm_field=fields.DateTime(format="iso")), default=None
)
timestamp: Optional[datetime] = Field(None)

@validator("timestamp", pre=True)
def parse_timestamp(cls, v: Optional[str]) -> Optional[datetime]:
return parse_timestamp(v)
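# Hedged usage sketch; filename, url and offset are assumed field names
# (they are elided from this excerpt):
# DomainRecord(filename="crawl.warc.gz", url="https://example.com", offset=0,
#              length=1234, timestamp="2023-09-09T00:00:00").timestamp
# -> datetime(2023, 9, 9, 0, 0), thanks to the validator above.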


@dataclass
@@ -71,36 +86,30 @@ class DomainCrawl:
# Extractor config


@dataclass_json
@dataclass
class ExtractorConfig:
class ExtractorConfig(BaseModel):
"""
Configuration for extractor.
"""

name: str
since: datetime | None = field(
metadata=config(mm_field=fields.DateTime(format="iso")), default=None
)
to: datetime | None = field(
metadata=config(mm_field=fields.DateTime(format="iso")), default=None
)
since: Optional[datetime] = Field(None)
to: Optional[datetime] = Field(None)

@validator("since", "to", pre=True)
def parse_timestamp(cls, v: Optional[str]) -> Optional[datetime]:
return parse_timestamp(v)

@dataclass_json
@dataclass
class RoutesConfig:

class RoutesConfig(BaseModel):
"""
Configuration for extractors.
"""

regexes: list[str] = field(default_factory=list)
extractors: list[ExtractorConfig] = field(default_factory=list)
regexes: List[str] = []
extractors: List[ExtractorConfig] = []


@dataclass_json
@dataclass
class ExtractConfig:
class ExtractConfig(BaseModel):
"""
Configuration for run.
"""
@@ -118,3 +127,6 @@ class MatchType(Enum):
PREFIX = "prefix"
HOST = "host"
DOMAIN = "domain"

def __str__(self):
return self.value
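# With __str__ returning the raw value, str(MatchType.DOMAIN) == "domain",
# which is why argparse can use choices=list(MatchType) in
# integrations/download.py below and still print readable option names
# (an inference from this diff, not stated in it).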
16 changes: 3 additions & 13 deletions cmoncrawl/integrations/commands.py
@@ -1,17 +1,18 @@
import argparse
import logging
from typing import Any, Callable, Dict
from typing import Any, Dict
from cmoncrawl.integrations.download import add_args as add_download_args
from cmoncrawl.integrations.extract import add_args as add_extract_args
from cmoncrawl.common.loggers import (
all_purpose_logger,
metadata_logger,
setup_loggers,
)


def add_args(parser: argparse.ArgumentParser):
parser.add_argument(
"--verbosity", "-v", action="count", default=0, help="Increase verbosity"
"--verbosity", "-v", choices=[0, 1, 2], type=int, default=1, help="Verbosity"
)
return parser
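# Usage sketch, assuming the flag sits on the top-level parser ahead of the
# subcommands: `cmon -v 2 download ...` enables DEBUG logging, while the
# default of 1 keeps INFO.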

@@ -32,17 +33,6 @@ def get_args():
return parser


def setup_loggers(verbosity: int):
verbosity_cfg = {
0: logging.WARNING,
1: logging.INFO,
2: logging.DEBUG,
}
verbosity = min(verbosity, max(verbosity_cfg.keys()))
all_purpose_logger.setLevel(verbosity_cfg[verbosity])
metadata_logger.setLevel(verbosity_cfg[verbosity])


def process_args(args: argparse.Namespace):
setup_loggers(args.verbosity)

4 changes: 2 additions & 2 deletions cmoncrawl/integrations/download.py
@@ -94,13 +94,13 @@ def add_args(subparser: Any):
parser.add_argument(
"--match_type",
type=MatchType,
choices=list(MatchType.__members__.values()),
choices=list(MatchType),
help="Match type for the url, see cdx-api for more info",
)
parser.add_argument(
"--max_directory_size",
type=int,
default=1000,
default=None,
help="Max number of files per directory",
)
parser.add_argument(