Skip to content

Commit

Permalink
Merge pull request #1031 from IBM/docling-2.21
Browse files Browse the repository at this point in the history
Upgrade Docling to v2.21
  • Loading branch information
touma-I authored Feb 10, 2025
2 parents c5952c5 + 26a41d3 commit 2dca254
Show file tree
Hide file tree
Showing 17 changed files with 62 additions and 70 deletions.
4 changes: 2 additions & 2 deletions transforms/language/doc_chunk/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
docling-core==2.3.0
pydantic>=2.0.0,<2.10.0
docling-core==2.18.0
pydantic>=2.0.0
llama-index-core>=0.11.22,<0.12.0
21 changes: 11 additions & 10 deletions transforms/language/doc_chunk/test-data/expected/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"job name": "doc_chunk",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-10-30 18:38:40",
"end_time": "2024-10-30 18:38:40",
"start_time": "2025-02-10 15:20:06",
"end_time": "2025-02-10 15:20:07",
"status": "success"
},
"code": {
Expand All @@ -25,6 +25,7 @@
"output_bbox_column_name": "bbox",
"chunk_size_tokens": 128,
"chunk_overlap_tokens": 30,
"dl_min_chunk_len": null,
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
Expand All @@ -34,29 +35,29 @@
"num_processors": 0
},
"execution_stats": {
"cpus": 19.5,
"cpus": 25.8,
"gpus": 0,
"memory": 27.48,
"memory": 24.41,
"object_store": 0,
"execution time, min": 0.001
},
"job_output_stats": {
"source_files": 1,
"source_size": 12073,
"result_files": 1,
"result_size": 14363,
"processing_time": 0.043,
"result_size": 16705,
"processing_time": 0.044,
"nfiles": 1,
"nrows": 39,
"nrows": 29,
"source_doc_count": 1,
"result_doc_count": 39
"result_doc_count": 29
},
"source": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/test-data/input",
"name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/test-data/input",
"type": "path"
},
"target": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/python/output",
"name": "/Users/dol/codes/data-prep-kit/transforms/language/doc_chunk/output",
"type": "path"
}
}
Binary file modified transforms/language/doc_chunk/test-data/expected/test1.parquet
Binary file not shown.
5 changes: 2 additions & 3 deletions transforms/language/pdf2parquet/Dockerfile.python
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,10 @@ RUN pip install ${PIP_INSTALL_EXTRA_ARGS} -r requirements.txt

# Set environment
ENV PYTHONPATH /home/dpk
ENV PATH="/home/dpk/.local/bin:${PATH}"

# Download models
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; s=StandardPdfPipeline.download_models_hf(); print(f"Models cached in {s}")'

RUN docling-tools models download layout tableformer picture_classifier easyocr

# Parallelism
ENV OMP_NUM_THREADS=2
Expand Down
11 changes: 4 additions & 7 deletions transforms/language/pdf2parquet/Dockerfile.ray
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,12 @@ COPY --chmod=775 --chown=ray:root dpk_pdf2parquet/ dpk_pdf2parquet/
COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
RUN pip install ${PIP_INSTALL_EXTRA_ARGS} -r requirements.txt



# Download models
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
# RUN python -c 'from docling.document_converter import DocumentConverter; from pathlib import Path; DocumentConverter.download_models_hf(local_dir=Path("./artifacts/"));'
RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; s=StandardPdfPipeline.download_models_hf(); print(f"Models cached in {s}")'

# Set environment
ENV PYTHONPATH /home/ray
ENV PATH="/home/ray/.local/bin:${PATH}"

# Download models
RUN docling-tools models download layout tableformer picture_classifier easyocr

# Parallelism
ENV OMP_NUM_THREADS=2
Expand Down
9 changes: 5 additions & 4 deletions transforms/language/pdf2parquet/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
docling-core==2.3.0
docling-ibm-models==2.0.3
deepsearch-glm==0.26.1
docling==2.3.1
docling-core==2.18.0
docling-ibm-models==3.3.1
docling-parse==3.3.0
deepsearch-glm==1.0.0
docling==2.21.0
filetype >=1.2.0, <2.0.0
Binary file not shown.
28 changes: 11 additions & 17 deletions transforms/language/pdf2parquet/test-data/expected/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,11 @@
"job name": "pdf2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-11-13 08:35:51",
"end_time": "2024-11-13 08:36:23",
"start_time": "2025-02-10 14:18:13",
"end_time": "2025-02-10 14:18:21",
"status": "success"
},
"code": {
"github": "github",
"commit_hash": "12345",
"path": "path"
},
"code": null,
"job_input_params": {
"batch_size": -1,
"artifacts_path": null,
Expand All @@ -23,42 +19,40 @@
"ocr_engine": "easyocr",
"bitmap_area_threshold": 0.05,
"pdf_backend": "dlparse_v2",
"double_precision": 0,
"double_precision": 8,
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
"files_to_use": [
".pdf",
".docx",
".pptx",
".zip"
],
"num_processors": 0
},
"execution_stats": {
"cpus": 147.5,
"cpus": 23.6,
"gpus": 0,
"memory": 33.72,
"memory": 29.99,
"object_store": 0,
"execution time, min": 0.522
"execution time, min": 0.127
},
"job_output_stats": {
"source_files": 2,
"source_size": 605137,
"result_files": 2,
"result_size": 33078,
"processing_time": 4.221,
"result_size": 32765,
"processing_time": 3.93,
"nrows": 3,
"nsuccess": 3,
"nfail": 0,
"nskip": 0
},
"source": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/python/test-data/input",
"name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/test-data/input",
"type": "path"
},
"target": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/python/output",
"name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/output",
"type": "path"
}
}
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"job name": "pdf2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-11-13 08:37:05",
"end_time": "2024-11-13 08:37:11",
"start_time": "2025-02-10 14:45:21",
"end_time": "2025-02-10 14:45:28",
"status": "success"
},
"code": {
Expand Down Expand Up @@ -36,29 +36,29 @@
"num_processors": 0
},
"execution_stats": {
"cpus": 143.9,
"cpus": 28.6,
"gpus": 0,
"memory": 34.21,
"memory": 24.32,
"object_store": 0,
"execution time, min": 0.1
"execution time, min": 0.113
},
"job_output_stats": {
"source_files": 2,
"source_size": 605137,
"result_files": 1,
"processing_time": 3.364,
"processing_time": 3.426,
"nrows": 3,
"nsuccess": 3,
"nfail": 0,
"nskip": 0,
"result_size": 27226
"result_size": 26903
},
"source": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/python/test-data/input",
"name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/test-data/input",
"type": "path"
},
"target": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/python/output",
"name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/output",
"type": "path"
}
}
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"job name": "pdf2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-11-13 08:37:56",
"end_time": "2024-11-13 08:38:02",
"start_time": "2025-02-10 14:44:43",
"end_time": "2025-02-10 14:44:50",
"status": "success"
},
"code": {
Expand Down Expand Up @@ -36,29 +36,29 @@
"num_processors": 0
},
"execution_stats": {
"cpus": 142.2,
"cpus": 28.5,
"gpus": 0,
"memory": 33.63,
"memory": 24.53,
"object_store": 0,
"execution time, min": 0.1
"execution time, min": 0.107
},
"job_output_stats": {
"source_files": 2,
"source_size": 605137,
"result_files": 2,
"result_size": 22993,
"processing_time": 3.422,
"result_size": 23484,
"processing_time": 3.518,
"nrows": 3,
"nsuccess": 3,
"nfail": 0,
"nskip": 0
},
"source": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/python/test-data/input",
"name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/test-data/input",
"type": "path"
},
"target": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/python/output",
"name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/output",
"type": "path"
}
}
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"job name": "pdf2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-11-13 08:37:31",
"end_time": "2024-11-13 08:37:34",
"start_time": "2025-02-10 14:44:09",
"end_time": "2025-02-10 14:44:11",
"status": "success"
},
"code": {
Expand Down Expand Up @@ -36,29 +36,29 @@
"num_processors": 0
},
"execution_stats": {
"cpus": 143.4,
"cpus": 28.8,
"gpus": 0,
"memory": 31.51,
"memory": 22.7,
"object_store": 0,
"execution time, min": 0.042
"execution time, min": 0.038
},
"job_output_stats": {
"source_files": 2,
"source_size": 605137,
"result_files": 2,
"result_size": 29694,
"processing_time": 2.077,
"result_size": 29781,
"processing_time": 1.506,
"nrows": 3,
"nsuccess": 3,
"nfail": 0,
"nskip": 0
},
"source": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/python/test-data/input",
"name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/test-data/input",
"type": "path"
},
"target": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/python/output",
"name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/output",
"type": "path"
}
}
Binary file not shown.

0 comments on commit 2dca254

Please sign in to comment.