Skip to content

Commit

Permalink
use new models download CLI
Browse files Browse the repository at this point in the history
Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm committed Feb 10, 2025
1 parent cafe758 commit 26a41d3
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 10 deletions.
5 changes: 2 additions & 3 deletions transforms/language/pdf2parquet/Dockerfile.python
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,10 @@ RUN pip install ${PIP_INSTALL_EXTRA_ARGS} -r requirements.txt

# Set environment.
# PYTHONPATH is set to /home/dpk, and /home/dpk/.local/bin is prepended to PATH
# so executables located there resolve by bare name in later instructions
# (presumably where user-level pip installs place console scripts — confirm).
# Uses the key=value form; the space-separated `ENV key value` form is deprecated.
ENV PYTHONPATH=/home/dpk
ENV PATH="/home/dpk/.local/bin:${PATH}"

# Download models while the image is being built.
# NOTE(review): this span comes from a rendered diff without +/- markers, so it
# appears to show both the earlier `python -c` download commands and the
# `docling-tools` command that the commit ("use new models download CLI")
# introduces — presumably only the `docling-tools` line remains afterwards; confirm.
# Calls deepsearch_glm's load_pretrained_nlp_models with verbose=True.
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
# Calls StandardPdfPipeline.download_models_hf() and prints the value it returns
# as the location where the models were cached.
RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; s=StandardPdfPipeline.download_models_hf(); print(f"Models cached in {s}")'

# Invokes the docling-tools CLI to download the layout, tableformer,
# picture_classifier and easyocr models.
RUN docling-tools models download layout tableformer picture_classifier easyocr

# Parallelism
# Sets OMP_NUM_THREADS (the standard OpenMP thread-count variable) to 2 in the
# image environment; which libraries in this image honor it is not visible here.
ENV OMP_NUM_THREADS=2
Expand Down
11 changes: 4 additions & 7 deletions transforms/language/pdf2parquet/Dockerfile.ray
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,12 @@ COPY --chmod=775 --chown=ray:root dpk_pdf2parquet/ dpk_pdf2parquet/
# Copy requirements.txt into the image with mode 775, owned by ray:root.
COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
# Install the Python dependencies listed in requirements.txt.
# PIP_INSTALL_EXTRA_ARGS supplies extra pip flags; it is defined outside this
# view (presumably a build ARG — confirm).
RUN pip install ${PIP_INSTALL_EXTRA_ARGS} -r requirements.txt



# Download models while the image is being built.
# NOTE(review): this span comes from a rendered diff without +/- markers; these
# `python -c` commands look like the lines the commit ("use new models download
# CLI") removes in favor of `docling-tools` — confirm against the resulting file.
# Calls deepsearch_glm's load_pretrained_nlp_models with verbose=True.
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
# Disabled alternative: DocumentConverter.download_models_hf into ./artifacts/.
# RUN python -c 'from docling.document_converter import DocumentConverter; from pathlib import Path; DocumentConverter.download_models_hf(local_dir=Path("./artifacts/"));'
# Calls StandardPdfPipeline.download_models_hf() and prints the value it returns
# as the location where the models were cached.
RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; s=StandardPdfPipeline.download_models_hf(); print(f"Models cached in {s}")'

# Set environment.
# PYTHONPATH is set to /home/ray, and /home/ray/.local/bin is prepended to PATH
# so executables located there resolve by bare name in later instructions
# (presumably where user-level pip installs place console scripts — confirm).
# Uses the key=value form; the space-separated `ENV key value` form is deprecated.
ENV PYTHONPATH=/home/ray
ENV PATH="/home/ray/.local/bin:${PATH}"

# Download models while the image is being built: invokes the docling-tools CLI
# to download the layout, tableformer, picture_classifier and easyocr models.
RUN docling-tools models download layout tableformer picture_classifier easyocr

# Parallelism
# Sets OMP_NUM_THREADS (the standard OpenMP thread-count variable) to 2 in the
# image environment; which libraries in this image honor it is not visible here.
ENV OMP_NUM_THREADS=2
Expand Down

0 comments on commit 26a41d3

Please sign in to comment.