NVIDIA · edknv · Nov 14, 2024 · Nov 14, 2024 · Nov 18, 2024 · Nov 18, 2024
diff --git a/client/src/nv_ingest_client/nv_ingest_cli.py b/client/src/nv_ingest_client/nv_ingest_cli.py
@@ -116,7 +116,7 @@
 Example:
   --task 'split:{"split_by":"page", "split_length":10}'
   --task 'extract:{"document_type":"pdf", "extract_text":true}'
-  --task 'extract:{"document_type":"pdf", "extract_method":"doughnut"}'
+  --task 'extract:{"document_type":"pdf", "extract_method":"nemoretriever_parse"}'
   --task 'extract:{"document_type":"pdf", "extract_method":"unstructured_io"}'
   --task 'extract:{"document_type":"docx", "extract_text":true, "extract_images":true}'
   --task 'store:{"content_type":"image", "store_method":"minio", "endpoint":"minio:9000"}'

diff --git a/client/src/nv_ingest_client/primitives/tasks/extract.py b/client/src/nv_ingest_client/primitives/tasks/extract.py
@@ -19,10 +19,6 @@
 
 logger = logging.getLogger(__name__)
 
-DOUGHNUT_TRITON_HOST = os.environ.get("DOUGHNUT_TRITON_HOST", "localhost")
-DOUGHNUT_TRITON_PORT = os.environ.get("DOUGHNUT_TRITON_PORT", "8001")
-DOUGHNUT_BATCH_SIZE = os.environ.get("DOUGHNUT_TRITON_PORT", "16")
-
 UNSTRUCTURED_API_KEY = os.environ.get("UNSTRUCTURED_API_KEY", None)
 UNSTRUCTURED_URL = os.environ.get("UNSTRUCTURED_URL", "https://api.unstructured.io/general/v0/general")
 UNSTRUCTURED_STRATEGY = os.environ.get("UNSTRUCTURED_STRATEGY", "auto")
@@ -49,7 +45,7 @@
 
 _Type_Extract_Method_PDF = Literal[
     "adobe",
-    "doughnut",
+    "nemoretriever_parse",
     "haystack",
     "llama_parse",
     "pdfium",
@@ -74,7 +70,7 @@
     "tiff": get_args(_Type_Extract_Method_Image),
 }
 
-_Type_Extract_Tables_Method_PDF = Literal["yolox", "pdfium"]
+_Type_Extract_Tables_Method_PDF = Literal["yolox", "pdfium", "nemoretriever_parse"]
 
 _Type_Extract_Tables_Method_DOCX = Literal["python_docx",]
 
@@ -238,13 +234,6 @@ def to_dict(self) -> Dict:
                 "unstructured_url": "",  # TODO(Devin): Should be an environment variable
             }
             task_properties["params"].update(unstructured_properties)
-        elif self._extract_method == "doughnut":
-            doughnut_properties = {
-                "doughnut_triton_host": os.environ.get("DOUGHNUT_TRITON_HOST", DOUGHNUT_TRITON_HOST),
-                "doughnut_triton_port": os.environ.get("DOUGHNUT_TRITON_PORT", DOUGHNUT_TRITON_PORT),
-                "doughnut_batch_size": os.environ.get("DOUGHNUT_BATCH_SIZE", DOUGHNUT_BATCH_SIZE),
-            }
-            task_properties["params"].update(doughnut_properties)
         elif self._extract_method == "unstructured_io":
             unstructured_properties = {
                 "unstructured_api_key": os.environ.get("UNSTRUCTURED_API_KEY", UNSTRUCTURED_API_KEY),

diff --git a/docker-compose.yaml b/docker-compose.yaml
@@ -125,6 +125,27 @@ services:
               capabilities: [gpu]
     runtime: nvidia
 
+  nemoretriever-parse:
+    image: ${NEMORETRIEVER_PARSE_IMAGE:-nvcr.io/nvidia/nemo-microservices/nemoretriever-parse}:${NEMORETRIEVER_PARSE_TAG:-1.2.0ea}
+    ports:
+      - "8015:8000"
+      - "8016:8001"
+      - "8017:8002"
+    user: root
+    environment:
+      - NIM_HTTP_API_PORT=8000
+      - NIM_TRITON_LOG_VERBOSE=1
+      - NGC_API_KEY=${NIM_NGC_API_KEY:-${NGC_API_KEY:-ngcapikey}}
+      - CUDA_VISIBLE_DEVICES=0
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ["0"]
+              capabilities: [gpu]
+    runtime: nvidia
+
   nv-ingest-ms-runtime:
     image: nvcr.io/nvidia/nemo-microservices/nv-ingest:24.12
     build:
@@ -155,7 +176,6 @@ services:
       # build.nvidia.com hosted deplot
       #- DEPLOT_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/vlm/google/deplot
       - DEPLOT_INFER_PROTOCOL=http
-      - DOUGHNUT_GRPC_TRITON=triton-doughnut:8001
       - EMBEDDING_NIM_MODEL_NAME=${EMBEDDING_NIM_MODEL_NAME:-nvidia/nv-embedqa-e5-v5}
       - INGEST_LOG_LEVEL=DEFAULT
       # Message client for development
@@ -168,6 +188,10 @@ services:
       - MESSAGE_CLIENT_TYPE=redis
       - MINIO_BUCKET=${MINIO_BUCKET:-nv-ingest}
       - MRC_IGNORE_NUMA_CHECK=1
+      # build.nvidia.com hosted nemoretriever-parse
+      #- NEMORETRIEVER_PARSE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/vlm/nvidia/nemoretriever-parse
+      - NEMORETRIEVER_PARSE_HTTP_ENDPOINT=http://nemoretriever-parse:8000/v1/chat/completions
+      - NEMORETRIEVER_PARSE_INFER_PROTOCOL=http
       - NGC_API_KEY=${NGC_API_KEY:-ngcapikey}
       - NVIDIA_BUILD_API_KEY=${NVIDIA_BUILD_API_KEY:-${NGC_API_KEY:-ngcapikey}}
       - OTEL_EXPORTER_OTLP_ENDPOINT=otel-collector:4317

diff --git a/docs/docs/assets/images/doughnut_batch_dize.png b/docs/docs/assets/images/doughnut_batch_dize.png
diff --git a/docs/docs/user-guide/developer-guide/deployment.md b/docs/docs/user-guide/developer-guide/deployment.md
@@ -25,10 +25,6 @@ docker compose up -d otel-collector prometheus grafana zipkin
 # The `embed` task will not be functional without this service.
 docker compose up -d embedding
 
-# Optional (Triton) See below for Triton setup we need Triton for any model inference
-# This is only needed for captioning or DOUGHNUT based extraction.
-docker compose up -d triton
-
 # Ingest service
 docker compose up -d nv-ingest-ms-runtime
 ```

diff --git a/docs/docs/user-guide/developer-guide/environment-config.md b/docs/docs/user-guide/developer-guide/environment-config.md
@@ -7,8 +7,6 @@ The following are the environment configuration variables that you can specify i
 |----------------------------------|--------------------------------|-----------------------------------------------------------------------|
 | `CAPTION_CLASSIFIER_GRPC_TRITON` | - `triton:8001` <br/>                                      | The endpoint where the caption classifier model is hosted using gRPC for communication. This is used to send requests for caption classification. You must specify only ONE of an http or gRPC endpoint. If both are specified gRPC will take precedence. |
 | `CAPTION_CLASSIFIER_MODEL_NAME`  | - `deberta_large` <br/>                                    | The name of the caption classifier model. |
-| `DOUGHNUT_TRITON_HOST`           | - `triton-doughnut` <br/>                                  | The hostname or IP address of the DOUGHNUT model service. |
-| `DOUGHNUT_TRITON_PORT`           | - `8001` <br/>                                             | The port number on which the DOUGHNUT model service is listening. |
 | `INGEST_LOG_LEVEL`               | - `DEBUG` <br/> - `INFO` <br/> - `WARNING` <br/> - `ERROR` <br/> - `CRITICAL` <br/> | The log level for the ingest service, which controls the verbosity of the logging output. |
 | `MESSAGE_CLIENT_HOST`            | - `redis` <br/> - `localhost` <br/> - `192.168.1.10` <br/> | Specifies the hostname or IP address of the message broker used for communication between services. |
 | `MESSAGE_CLIENT_PORT`            | - `7670` <br/> - `6379` <br/>                              | Specifies the port number on which the message broker is listening. |

diff --git a/docs/docs/user-guide/developer-guide/nv-ingest_cli.md b/docs/docs/user-guide/developer-guide/nv-ingest_cli.md
@@ -32,7 +32,7 @@ Options:
                                   Example:
                                     --task 'split:{"split_by":"page", "split_length":10}'
                                     --task 'extract:{"document_type":"pdf", "extract_text":true}'
-                                    --task 'extract:{"document_type":"pdf", "extract_method":"doughnut"}'
+                                    --task 'extract:{"document_type":"pdf", "extract_method":"nemoretriever_parse"}'
                                     --task 'extract:{"document_type":"pdf", "extract_method":"unstructured_io"}'
                                     --task 'extract:{"document_type":"docx", "extract_text":true, "extract_images":true}'
                                     --task 'store:{"content_type":"image", "store_method":"minio", "endpoint":"minio:9000"}'
@@ -120,7 +120,7 @@ nv-ingest-cli \
 
 Submit a PDF file with splitting and extraction tasks.
 
-**Note: (TODO)** This currently only works for pdfium, doughnut, and Unstructured.io; haystack, Adobe, and LlamaParse
+**Note: (TODO)** This currently works only with the pdfium, nemoretriever_parse, and Unstructured.io extraction methods; haystack, Adobe, and LlamaParse
 have existing workflows but have not been fully converted to use our unified metadata schema.
 
 ```bash

diff --git a/src/nv_ingest/extraction_workflows/image/image_handlers.py b/src/nv_ingest/extraction_workflows/image/image_handlers.py
@@ -30,9 +30,9 @@
 from wand.image import Image as WandImage
 
 import nv_ingest.util.nim.yolox as yolox_utils
-from nv_ingest.extraction_workflows.pdf.doughnut_utils import crop_image
 from nv_ingest.schemas.image_extractor_schema import ImageConfigSchema
 from nv_ingest.schemas.metadata_schema import AccessLevelEnum
+from nv_ingest.util.image_processing.transforms import crop_image
 from nv_ingest.util.image_processing.transforms import numpy_to_base64
 from nv_ingest.util.nim.helpers import create_inference_client
 from nv_ingest.util.pdf.metadata_aggregators import CroppedImageWithContent
@@ -160,7 +160,8 @@ def extract_table_and_chart_images(
             *bbox, _ = bboxes
             h1, w1, h2, w2 = np.array(bbox) * np.array([height, width, height, width])
 
-            base64_img = crop_image(original_image, (int(h1), int(w1), int(h2), int(w2)))
+            cropped_img = crop_image(original_image, (int(h1), int(w1), int(h2), int(w2)))
+            base64_img = numpy_to_base64(cropped_img) if cropped_img is not None else None
 
             table_data = CroppedImageWithContent(
                 content="",

diff --git a/src/nv_ingest/extraction_workflows/pdf/__init__.py b/src/nv_ingest/extraction_workflows/pdf/__init__.py
@@ -4,7 +4,7 @@
 
 
 from nv_ingest.extraction_workflows.pdf.adobe_helper import adobe
-from nv_ingest.extraction_workflows.pdf.doughnut_helper import doughnut
+from nv_ingest.extraction_workflows.pdf.nemoretriever_parse_helper import nemoretriever_parse
 from nv_ingest.extraction_workflows.pdf.llama_parse_helper import llama_parse
 from nv_ingest.extraction_workflows.pdf.pdfium_helper import pdfium_extractor as pdfium
 from nv_ingest.extraction_workflows.pdf.tika_helper import tika
@@ -15,6 +15,6 @@
     "pdfium",
     "tika",
     "unstructured_io",
-    "doughnut",
+    "nemoretriever_parse",
     "adobe",
 ]