[SPARKNLP-1098] Adding PDF reader support #14499

Open · wants to merge 2 commits into master
3 changes: 2 additions & 1 deletion build.sbt
@@ -163,7 +163,8 @@ lazy val utilDependencies = Seq(
poiDocx
exclude ("org.apache.logging.log4j", "log4j-api"),
scratchpad
exclude ("org.apache.logging.log4j", "log4j-api")
exclude ("org.apache.logging.log4j", "log4j-api"),
pdfBox
)

lazy val typedDependencyParserDependencies = Seq(junit)
211 changes: 211 additions & 0 deletions examples/python/reader/SparkNLP_PDF_Reader_Demo.ipynb
@@ -0,0 +1,211 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n",
"\n",
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/reader/SparkNLP_PDF_Reader_Demo.ipynb)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "tzcU5p2gdak9"
},
"source": [
"# Introducing the PDF Reader in Spark NLP\n",
"This notebook showcases the newly added `sparknlp.read().pdf()` method in Spark NLP that parses PDF content from both local files and distributed file systems into a Spark DataFrame."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "DczWop6QeE8F",
"outputId": "ceb0e598-4c62-475d-fe65-74eb7d737652"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Apache Spark version: 3.5.3\n"
]
}
],
"source": [
"import sparknlp\n",
"# let's start Spark with Spark NLP\n",
"spark = sparknlp.start()\n",
"\n",
"print(\"Apache Spark version: {}\".format(spark.version))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "RFOFhaEedalB"
},
"source": [
"## Setup and Initialization\n",
"Let's keep in mind a few things before we start 😊\n",
"\n",
"Support for reading PDF files was introduced in Spark NLP 5.6.0. Please make sure you have upgraded to the latest Spark NLP release."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- Let's install and setup Spark NLP in Google Colab\n",
"- This part is pretty easy via our simple script"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For the local files example, we will download a couple of PDF files from the Spark NLP GitHub repo:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"id": "ya8qZe00dalC"
},
"outputs": [],
"source": [
"!mkdir pdf-files\n",
"!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/pdf-title.pdf -P pdf-files\n",
"!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/pdf/text_3_pages.pdf -P pdf-files"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EoFI66NAdalE"
},
"source": [
"## Parsing PDFs from Local Files\n",
"Use the `pdf()` method to parse PDF content from local directories."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bAkMjJ1vdalE",
"outputId": "db995ee4-16fc-483a-eb89-c05b7cb5c863"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning::Spark Session already created, some configs may not take.\n",
"+--------------------+--------------------+------+--------------------+----------------+---------------+--------------------+---------+-------+\n",
"| path| modificationTime|length| text|height_dimension|width_dimension| content|exception|pagenum|\n",
"+--------------------+--------------------+------+--------------------+----------------+---------------+--------------------+---------+-------+\n",
"|file:/content/pdf...|2025-01-15 20:48:...| 25803|This is a Title \\...| 842| 596|[25 50 44 46 2D 3...| NULL| 0|\n",
"|file:/content/pdf...|2025-01-15 20:48:...| 9487|This is a page.\\n...| 841| 595|[25 50 44 46 2D 3...| NULL| 0|\n",
"+--------------------+--------------------+------+--------------------+----------------+---------------+--------------------+---------+-------+\n",
"\n"
]
}
],
"source": [
"import sparknlp\n",
"pdf_df = sparknlp.read().pdf(\"./pdf-files\")\n",
"\n",
"pdf_df.show()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "VWbUgoVQrO8m",
"outputId": "7bbc1f6e-9188-4c42-c3fb-126198e812a5"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- path: string (nullable = true)\n",
" |-- modificationTime: timestamp (nullable = true)\n",
" |-- length: long (nullable = true)\n",
" |-- text: string (nullable = true)\n",
" |-- height_dimension: integer (nullable = true)\n",
" |-- width_dimension: integer (nullable = true)\n",
" |-- content: binary (nullable = true)\n",
" |-- exception: string (nullable = true)\n",
" |-- pagenum: integer (nullable = true)\n",
"\n"
]
}
],
"source": [
"pdf_df.printSchema()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BB2FEfegGuxl"
},
"source": [
"You can also use DFS file systems like:\n",
"- Databricks: `dbfs://`\n",
"- HDFS: `hdfs://`\n",
"- Microsoft Fabric OneLake: `abfss://`"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
3 changes: 3 additions & 0 deletions project/Dependencies.scala
@@ -135,6 +135,7 @@ object Dependencies {
val llamaCppAarch64 = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-aarch64" % llamaCppVersion

val jsoupVersion = "1.18.2"

val jsoup = "org.jsoup" % "jsoup" % jsoupVersion

val jakartaMailVersion = "2.1.3"
@@ -146,5 +147,7 @@
val poiDocx = "org.apache.poi" % "poi-ooxml" % poiVersion
val scratchpad = "org.apache.poi" % "poi-scratchpad" % poiVersion

val pdfBoxVersion = "2.0.28"
val pdfBox = "org.apache.pdfbox" % "pdfbox" % pdfBoxVersion
/** ------- Dependencies end ------- */
}
65 changes: 65 additions & 0 deletions python/sparknlp/reader/pdf_to_text.py
@@ -0,0 +1,65 @@
from pyspark import keyword_only
from pyspark.ml.param import Param, Params, TypeConverters
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.ml.util import JavaMLReadable, JavaMLWritable
from pyspark.ml.wrapper import JavaTransformer


class PdfToText(JavaTransformer, HasInputCol, HasOutputCol,
JavaMLReadable, JavaMLWritable):
"""
Extract text from a PDF document into a single string, or into several strings, one per page.
The input is a column with the binary representation of a PDF document.
The output is a column with the extracted text and the page number.
If split-to-page is enabled, each page is exploded into a separate row.
"""
pageNumCol = Param(Params._dummy(), "pageNumCol",
"Page number output column name.",
typeConverter=TypeConverters.toString)

partitionNum = Param(Params._dummy(), "partitionNum",
"Number of partitions.",
typeConverter=TypeConverters.toInt)

storeSplittedPdf = Param(Params._dummy(), "storeSplittedPdf",
"Whether to force storing the split PDF.",
typeConverter=TypeConverters.toBoolean)

@keyword_only
def __init__(self):
"""
__init__(self)
"""
super(PdfToText, self).__init__()
self._java_obj = self._new_java_obj("com.johnsnowlabs.reader.PdfToText", self.uid)


def setInputCol(self, value):
"""
Sets the value of :py:attr:`inputCol`.
"""
return self._set(inputCol=value)

def setOutputCol(self, value):
"""
Sets the value of :py:attr:`outputCol`.
"""
return self._set(outputCol=value)

def setPageNumCol(self, value):
"""
Sets the value of :py:attr:`pageNumCol`.
"""
return self._set(pageNumCol=value)

def setPartitionNum(self, value):
"""
Sets the value of :py:attr:`partitionNum`.
"""
return self._set(partitionNum=value)

def setStoreSplittedPdf(self, value):
"""
Sets the value of :py:attr:`storeSplittedPdf`.
"""
return self._set(storeSplittedPdf=value)
41 changes: 41 additions & 0 deletions python/sparknlp/reader/sparknlp_reader.py
Expand Up @@ -91,6 +91,40 @@ class SparkNLPReader(ExtendedJavaWrapper):
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)


Instantiates class to read PDF files.

pdfPath: a path to a directory of PDF files or a path to a single PDF file, e.g.
"path/pdfs/"

Examples
--------
>>> from sparknlp.reader import SparkNLPReader
>>> pdf_df = SparkNLPReader().pdf(spark, "home/user/pdfs-directory")

You can also use Spark NLP to read PDF files with one line of code:
>>> import sparknlp
>>> pdf_df = sparknlp.read().pdf("home/user/pdfs-directory")
>>> pdf_df.show(truncate=False)

+--------------------+--------------------+------+--------------------+----------------+---------------+--------------------+---------+-------+
| path| modificationTime|length| text|height_dimension|width_dimension| content|exception|pagenum|
+--------------------+--------------------+------+--------------------+----------------+---------------+--------------------+---------+-------+
|file:/content/pdf...|2025-01-15 20:48:...| 25803|This is a Title \...| 842| 596|[25 50 44 46 2D 3...| NULL| 0|
|file:/content/pdf...|2025-01-15 20:48:...| 9487|This is a page.\n...| 841| 595|[25 50 44 46 2D 3...| NULL| 0|
+--------------------+--------------------+------+--------------------+----------------+---------------+--------------------+---------+-------+

pdf_df.printSchema()
root
|-- path: string (nullable = true)
|-- modificationTime: timestamp (nullable = true)
|-- length: long (nullable = true)
|-- text: string (nullable = true)
|-- height_dimension: integer (nullable = true)
|-- width_dimension: integer (nullable = true)
|-- content: binary (nullable = true)
|-- exception: string (nullable = true)
|-- pagenum: integer (nullable = true)
"""

def __init__(self, spark, params=None):
@@ -118,4 +152,11 @@ def doc(self, docPath):
raise TypeError("docPath must be a string")
jdf = self._java_obj.doc(docPath)
dataframe = self.getDataFrame(self.spark, jdf)
return dataframe

def pdf(self, pdfPath):
if not isinstance(pdfPath, str):
raise TypeError("pdfPath must be a string")
jdf = self._java_obj.pdf(pdfPath)
dataframe = self.getDataFrame(self.spark, jdf)
return dataframe