Skip to content

Commit

Permalink
router loads init
Browse files: browse the repository at this point in the history
  • Loading branch information
hynky1999 committed May 11, 2023
1 parent 5e3a198 commit 1f809e0
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 4 deletions.
12 changes: 10 additions & 2 deletions cmoncrawl/processor/pipeline/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,11 @@ def __init__(self, filter_non_ok: bool = True):
self.filter_non_ok = filter_non_ok

def extract_soup(self, soup: BeautifulSoup, metadata: PipeMetadata):
metadata.name = metadata.domain_record.url.replace("/", "_")[:100]
metadata.name = (
metadata.domain_record.url.replace("/", "_")[:100]
if metadata.domain_record.url is not None
else "unknown"
)
result_dict: Dict[str, Any] = {"html": str(soup)}

return result_dict
Expand Down Expand Up @@ -120,7 +124,11 @@ def __init__(self, filter_non_ok: bool = True):
self.filter_non_ok = filter_non_ok

def extract_soup(self, soup: BeautifulSoup, metadata: PipeMetadata):
metadata.name = metadata.domain_record.url.replace("/", "_")[:100]
metadata.name = (
metadata.domain_record.url.replace("/", "_")[:100]
if metadata.domain_record.url is not None
else "unknown"
)
result_dict: Dict[str, Any] = {
"domain_record": metadata.domain_record.to_dict()
}
Expand Down
10 changes: 8 additions & 2 deletions cmoncrawl/processor/pipeline/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def __init__(self):
self.registered_routes: List[Route] = []
self.modules: Dict[str, IExtractor] = {}

def load_module(self, module_path: Path) -> IExtractor:
def load_module(self, module_path: Path):
module_name = os.path.splitext(os.path.basename(module_path))[0]
spec = importlib.util.spec_from_file_location(module_name, module_path)
if spec is None:
Expand All @@ -53,7 +53,10 @@ def load_module(self, module_path: Path) -> IExtractor:
raise Exception("Failed to load module: " + module_name)

spec.loader.exec_module(module)
return module, module_name

def load_module_as_extractor(self, module_path: Path):
module, module_name = self.load_module(module_path)
name: str = getattr(module, "NAME", module_name)
extractor: IExtractor | None = getattr(module, "extractor", None)
if extractor is None:
Expand All @@ -69,7 +72,10 @@ def load_modules(self, folder: Path):
if not file.endswith(".py"):
continue

extractors.append(self.load_module(Path(root) / file))
if file == "__init__.py":
self.load_module(Path(root) / file)

extractors.append(self.load_module_as_extractor(Path(root) / file))
all_purpose_logger.info(f"Loaded {len(extractors)} extractors")

def load_extractor(self, name: str, extractor: IExtractor):
Expand Down

0 comments on commit 1f809e0

Please sign in to comment.