getomni-ai · annapo23 · Mar 5, 2025 · Mar 5, 2025
diff --git a/node-zerox/src/index.ts b/node-zerox/src/index.ts
@@ -17,6 +17,7 @@ import {
   extractPagesFromStructuredDataFile,
   getTesseractScheduler,
   isCompletionResponse,
+  isImageEmpty,
   isStructuredDataFile,
   prepareWorkersForImageProcessing,
   runRetries,
@@ -25,9 +26,7 @@ import {
 } from "./utils";
 import { createModel } from "./models";
 import {
-  CompletionResponse,
   ErrorMode,
-  ExtractionResponse,
   ModelOptions,
   ModelProvider,
   OperationMode,
@@ -107,19 +106,15 @@ export const zerox = async ({
 
   if (extractOnly) directImageExtraction = true;
 
-  let scheduler: Tesseract.Scheduler | null = null;
-  // Add initial tesseract workers if we need to correct orientation
-  if (correctOrientation) {
-    scheduler = await getTesseractScheduler();
-    const workerCount =
-      maxTesseractWorkers !== -1 && maxTesseractWorkers < NUM_STARTING_WORKERS
-        ? maxTesseractWorkers
-        : NUM_STARTING_WORKERS;
-    await addWorkersToTesseractScheduler({
-      numWorkers: workerCount,
-      scheduler,
-    });
-  }
+  let scheduler: Tesseract.Scheduler = await getTesseractScheduler();
+  const workerCount =
+    maxTesseractWorkers !== -1 && maxTesseractWorkers < NUM_STARTING_WORKERS
+      ? maxTesseractWorkers
+      : NUM_STARTING_WORKERS;
+  await addWorkersToTesseractScheduler({
+    numWorkers: workerCount,
+    scheduler,
+  });
 
   try {
     // Ensure temp directory exists + create temp folder
@@ -207,13 +202,11 @@ export const zerox = async ({
         imagePaths = await Promise.all(compressPromises);
       }
 
-      if (correctOrientation) {
-        await prepareWorkersForImageProcessing({
-          maxTesseractWorkers,
-          numImages: imagePaths.length,
-          scheduler,
-        });
-      }
+      await prepareWorkersForImageProcessing({
+        maxTesseractWorkers,
+        numImages: imagePaths.length,
+        scheduler,
+      });
 
       // Start processing OCR using LLM
       const modelInstance = createModel({
@@ -239,49 +232,56 @@ export const zerox = async ({
 
           let page: Page;
           try {
-            let rawResponse: CompletionResponse | ExtractionResponse;
-            if (customModelFunction) {
-              rawResponse = await runRetries(
+            const [isEmpty, rawResponse] = await Promise.all([
+              isImageEmpty({ imageBuffer: correctedBuffer, scheduler }),
+              runRetries(
                 () =>
-                  customModelFunction({
-                    buffer: correctedBuffer,
-                    image: imagePath,
-                    maintainFormat,
-                    priorPage,
-                  }),
+                  customModelFunction
+                    ? customModelFunction({
+                        buffer: correctedBuffer,
+                        image: imagePath,
+                        maintainFormat,
+                        priorPage,
+                      })
+                    : modelInstance.getCompletion(OperationMode.OCR, {
+                        image: correctedBuffer,
+                        maintainFormat,
+                        priorPage,
+                        prompt,
+                      }),
                 maxRetries,
                 pageNumber
-              );
+              ),
+            ]);
+
+            if (isEmpty) {
+              page = {
+                content: "",
+                contentLength: 0,
+                inputTokens: 0,
+                outputTokens: 0,
+                page: pageNumber,
+                status: PageStatus.SUCCESS,
+              };
             } else {
-              rawResponse = await runRetries(
-                () =>
-                  modelInstance.getCompletion(OperationMode.OCR, {
-                    image: correctedBuffer,
-                    maintainFormat,
-                    priorPage,
-                    prompt,
-                  }),
-                maxRetries,
-                pageNumber
+              const response = CompletionProcessor.process(
+                OperationMode.OCR,
+                rawResponse
               );
-            }
-            const response = CompletionProcessor.process(
-              OperationMode.OCR,
-              rawResponse
-            );
 
-            inputTokenCount += response.inputTokens;
-            outputTokenCount += response.outputTokens;
+              inputTokenCount += response.inputTokens;
+              outputTokenCount += response.outputTokens;
 
-            if (isCompletionResponse(OperationMode.OCR, response)) {
-              priorPage = response.content;
-            }
+              if (isCompletionResponse(OperationMode.OCR, response)) {
+                priorPage = response.content;
+              }
 
-            page = {
-              ...response,
-              page: pageNumber,
-              status: PageStatus.SUCCESS,
-            };
+              page = {
+                ...response,
+                page: pageNumber,
+                status: PageStatus.SUCCESS,
+              };
+            }
             numSuccessfulOCRRequests++;
           } catch (error) {
             console.error(`Failed to process image ${imagePath}:`, error);

diff --git a/node-zerox/src/utils/image.ts b/node-zerox/src/utils/image.ts
@@ -25,7 +25,7 @@ export const cleanupImage = async ({
     image.trim();
   }
 
-  // scheduler would always be non-null if correctOrientation is true
+  // Scheduler would always be non-null if correctOrientation is true
   // Adding this check to satisfy typescript
   if (correctOrientation && scheduler) {
     const optimalRotation = await determineOptimalRotation({
@@ -43,8 +43,13 @@ export const cleanupImage = async ({
   return correctedBuffer;
 };
 
-// Determine the optimal image orientation based on OCR confidence
-// Run Tesseract on 4 image orientations and compare the outputs
+/**
+ * Determine the optimal image orientation based on OCR confidence
+ * Runs Tesseract on 4 image orientations and compares the outputs
+ * @param image - The image to analyze
+ * @param scheduler - The Tesseract scheduler for OCR operations
+ * @returns The degrees to rotate the image
+ */
 const determineOptimalRotation = async ({
   image,
   scheduler,
@@ -109,3 +114,22 @@ export const compressImage = async (
     return image;
   }
 };
+
+/**
+ * Reports whether OCR finds no text in an image (empty or whitespace-only output)
+ * @param imageBuffer - The image buffer passed to the scheduler's "recognize" job
+ * @param scheduler - The Tesseract scheduler the recognition job is added to
+ * @returns True if the recognized text is empty after trimming, otherwise false
+ */
+export const isImageEmpty = async ({
+  imageBuffer,
+  scheduler,
+}: {
+  imageBuffer: Buffer;
+  scheduler: Tesseract.Scheduler;
+}): Promise<boolean> => {
+  const {
+    data: { text },
+  } = await scheduler.addJob("recognize", imageBuffer);
+  return !text.trim(); // whitespace-only OCR output counts as empty
+};