diff --git a/.vscode/settings.json b/.vscode/settings.json
index 96d2c1f9..cfef7af0 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,6 +1,6 @@
 {
     "html.format.wrapLineLength": 80,
-    "python.formatting.provider": "black",
+    "python.formatting.provider": "none",
     "python.linting.enabled": true,
     "python.testing.unittestArgs": [
         "-v",
@@ -13,5 +13,8 @@
     "python.testing.unittestEnabled": true,
     "python.analysis.typeCheckingMode": "strict",
     "python.linting.mypyPath": "/usr/bin/mypy",
-    "cSpell.enabled": false
+    "cSpell.enabled": false,
+    "[python]": {
+        "editor.defaultFormatter": "ms-python.black-formatter"
+    }
 }
diff --git a/README.md b/README.md
index f1482191..c570de2d 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,18 @@
 Unlike all other CommonCrawl extractors, this project allows creation of custom extractors with a high level of modularity.
 Unlike getting records from the CommonCrawl index using Amazon's Athena, this solution is completely free of cost :)
 
+### Installation
+#### From PyPI
+```bash
+$ pip install cmoncrawl
+```
+#### From source
+```bash
+$ git clone https://github.com/hynky1999/CmonCrawl
+$ cd CmonCrawl
+$ pip install -r requirements.txt
+$ pip install -e .
+```
 
 ### Usage
@@ -14,7 +26,7 @@ To create them you need example html files you want to extract.
 You can use the following command to get html files from the CommonCrawl dataset:
 
 ```bash
-$ cmon download --match_type=domain --limit=1000 example.com html_output html
+$ cmon download --match_type=domain --limit=100 example.com html_output html
 ```
 This will download the first 100 html files from example.com and save them in html_output.
@@ -24,6 +36,8 @@ Once you have the files to extract, you can create your extractor.
 To do so, you need to create a new python file, e.g. my_extractor.py, in the extractors directory and add the following code:
 
 ```python
+from bs4 import BeautifulSoup
+from cmoncrawl.common.types import PipeMetadata
 from cmoncrawl.processor.pipeline.extractor import BaseExtractor
 class MyExtractor(BaseExtractor):
     def __init__(self):
@@ -33,6 +47,12 @@ def extract_soup(self, soup: BeautifulSoup, metadata: PipeMetadata):
         # here you can extract the data you want from the soup
         # and return a dict with the data you want to save
+        body = soup.select_one("body")
+        if body is None:
+            return None
+        return {
+            "body": body.get_text()
+        }
 
     # You can also override the following methods to drop the files you don't want to extract
     # Return True to keep the file, False to drop it
@@ -62,8 +82,8 @@ In our case the config would look like this:
         # You can use since and to to choose the extractor based on the date of the crawl
         # You can omit either of them
-        "since": "2009-01-01T00:00:00+00:00",
-        "to": "2009-01-01T00:00:00+00:00"
+        "since": "2009-01-01",
+        "to": "2025-01-01"
     }]
 },
 # More routes here
@@ -75,7 +95,7 @@ To test the extraction, you can use the following command:
 ```bash
-$ cmon extract config.json extracted_output html_output/*/*.html html
+$ cmon extract config.json extracted_output html_output/*.html html
 ```
 
 ### Crawl the sites
@@ -95,12 +115,13 @@ This will download the first 100000 records from example.com and save them in dr_output.
 Once you have the records, you can use the following command to extract them:
 
 ```bash
-$ cmon extract --n_proc=4 config.json extracted_output dr_output/*/*.jsonl record
+$ cmon extract --n_proc=4 config.json extracted_output dr_output/*.jsonl record
 ```
 Note that you can use the --n_proc option to specify the number of processes to use for the extraction. Multiprocessing is done at the file level, so if you have just one file, it will not be used.
-
+### Other examples
+For other examples, see [examples](https://github.com/hynky1999/CmonCrawl/tree/main/examples)
 ### Advanced usage
 The whole project was written with modularity in mind. That means that you can adjust the framework to your needs.
 To know more, see the [documentation](https://hynky1999.github.io/CmonCrawl/)
diff --git a/cmoncrawl/integrations/extract.py b/cmoncrawl/integrations/extract.py
index f09c6205..a3aeb1a8 100644
--- a/cmoncrawl/integrations/extract.py
+++ b/cmoncrawl/integrations/extract.py
@@ -209,7 +209,6 @@ def run_extract(args: argparse.Namespace):
                 _extract_task,
                 [
                     (
-                        # [metadata_logger.level, all_purpose_logger.level],
                         args.output_path / f"{file.stem}"
                         if args.n_proc != 1
                         else args.output_path,
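
A note on the extract.py hunk above: the deleted line is a stale commented-out argument, and the surrounding expression picks a per-file output directory whenever more than one process is used. A minimal sketch of that path logic follows; the helper name `task_output_path` is ours, not part of the codebase:

```python
from pathlib import Path


def task_output_path(output_path: Path, input_file: Path, n_proc: int) -> Path:
    # Mirrors the expression in the hunk: with multiprocessing enabled, each
    # input file writes into its own subdirectory named after the file stem,
    # presumably so parallel workers never share an output directory.
    return output_path / input_file.stem if n_proc != 1 else output_path
```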
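Relatedly, the README hunk above notes that multiprocessing works per input file, so a single large .jsonl will not parallelize. One workaround, assuming the records file is plain JSON Lines and GNU coreutils split is available (file names below are illustrative), is to split it into line-aligned chunks first:

```bash
# Split one large records file into 10k-line chunks so --n_proc can spread them across processes.
$ split -l 10000 --additional-suffix=.jsonl dr_output/records.jsonl dr_output/records_part_
$ cmon extract --n_proc=4 config.json extracted_output dr_output/records_part_*.jsonl record
```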