Skip to content

Commit

Permalink
feat(cli): add option to select maximum number of threads
Browse files Browse the repository at this point in the history
  • Loading branch information
dnlzrgz committed Sep 28, 2024
1 parent 80879e8 commit b1ecc89
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 18 deletions.
10 changes: 8 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[project]
name = "housaku"
version = "v0.3.0"
description = "A personal search engine."
version = "v0.3.1"
description = "A powerful personal search engine built on top of SQLite's FTS5."
readme = "README.md"
license = "MIT"
requires-python = ">=3.12"
Expand All @@ -16,6 +16,12 @@ dependencies = [
"pymupdf>=1.24.10",
]

[project.urls]
homepage = "https://dnlzrgz.com/projects/housaku/"
source = "https://github.com/dnlzrgz/housaku"
issues = "https://github.com/dnlzrgz/housaku/issues"
releases = "https://github.com/dnlzrgz/housaku/releases"

[project.scripts]
housaku = "housaku:cli.cli"

Expand Down
22 changes: 20 additions & 2 deletions src/housaku/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import subprocess
import urllib.parse
from functools import partial
from multiprocessing import cpu_count
from pathlib import Path
from time import perf_counter
import rich_click as click
Expand Down Expand Up @@ -55,8 +56,15 @@ def cli(ctx) -> None:
multiple=True,
help="Directory or pattern to exclude from indexing.",
)
@click.option(
"-t",
"--max-threads",
type=click.IntRange(min=1),
default=cpu_count() // 2,
help="Maximum number of threads to use for indexing (default: half of CPU cores).",
)
@click.pass_context
def index(ctx, include, exclude) -> None:
def index(ctx, include, exclude, max_threads) -> None:
"""
Index documents and posts from the specified sources in the
config.toml file.
Expand All @@ -75,13 +83,23 @@ def index(ctx, include, exclude) -> None:
"[green]Start indexing... Please, wait a moment.",
spinner="arrow",
) as status:
try:
with db_connection(settings.sqlite_url) as conn:
conn.execute("DELETE FROM documents")
console.print("[green][Ok][/] Cleaned database.")
except Exception as e:
console.print(f"[red][Err][/] Failed to clear database: {e}")
return

for dir in merged_include:
status.update(
f"[green]Indexing documents from '{dir.name}'... Please wait, this may take a moment.[/]"
)
files = list_files(dir, merged_exclude)

with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
with concurrent.futures.ThreadPoolExecutor(
max_workers=max_threads
) as executor:
executor.map(partial_function, files)

status.update(
Expand Down
27 changes: 14 additions & 13 deletions src/housaku/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import fnmatch
import mimetypes
from pathlib import Path
from typing import is_typeddict
import pymupdf
from housaku.db import db_connection
from housaku.utils import console
Expand All @@ -16,25 +17,25 @@


def list_files(root: Path, exclude: list[str] = []) -> list[Path]:
if not root.is_dir():
# TODO: just add the file if not in exclude list.
raise Exception(f"path '{root}' is not a directory")

exclude_set = set(exclude)
pending_dirs = deque([root])
files = []

while pending_dirs:
dir = pending_dirs.popleft()
for path in dir.iterdir():
if any(fnmatch.fnmatch(path.name, pattern) for pattern in exclude_set):
continue
if root.is_dir():
while pending_dirs:
dir = pending_dirs.popleft()
for path in dir.iterdir():
if any(fnmatch.fnmatch(path.name, pattern) for pattern in exclude_set):
continue

if path.is_dir():
pending_dirs.append(path)
if path.is_dir():
pending_dirs.append(path)

if path.is_file():
files.append(path.resolve())
if path.is_file():
files.append(path.resolve())
else:
if not any(fnmatch.fnmatch(root.name, pattern) for pattern in exclude_set):
files.append(root.resolve())

return files

Expand Down
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit b1ecc89

Please sign in to comment.