diff --git a/examples/python/reader/SparkNLP_Excel_Reader_Demo.ipynb b/examples/python/reader/SparkNLP_Excel_Reader_Demo.ipynb new file mode 100644 index 00000000000000..6bd5c714dbc971 --- /dev/null +++ b/examples/python/reader/SparkNLP_Excel_Reader_Demo.ipynb @@ -0,0 +1,408 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/reader/SparkNLP_Excel_Reader_Demo.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tzcU5p2gdak9" + }, + "source": [ + "# Introducing Excel reader in SparkNLP\n", + "This notebook showcases the newly added `sparknlp.read().xls()` method in Spark NLP, which parses Excel content from both local and distributed file systems into a Spark DataFrame." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xrvHhiTAdfGd", + "outputId": "77803c7f-1033-4f0c-dda4-a818986367e5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mounted at /content/drive\n" + ] + } + ], + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "mjV3NcQ8eA52" + }, + "outputs": [], + "source": [ + "!cp drive/MyDrive/JSL/sparknlp/sparknlp.jar .\n", + "!cp drive/MyDrive/JSL/sparknlp/spark_nlp-5.5.1-py2.py3-none-any.whl ." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FuWVW6HPXRQw", + "outputId": "fd3b80c5-4bf9-4d74-ac2e-8100937b71e9" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: PYSPARK=3.4.0\n" + ] + } + ], + "source": [ + "%env PYSPARK=3.4.0" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pEmutNjReCgc", + "outputId": "7cb8d345-719d-4a71-d91d-57f9eb1b2b85" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: PYSPARK in /usr/local/lib/python3.10/dist-packages (3.5.3)\n", + "Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from PYSPARK) (0.10.9.7)\n" + ] + } + ], + "source": [ + "!pip install pyspark==$PYSPARK" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3qjPeDjvfCpA", + "outputId": "b7cb29be-3052-4be8-a94d-ad3b9777e926" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing ./spark_nlp-5.5.1-py2.py3-none-any.whl\n", + "Installing collected packages: spark-nlp\n", + "Successfully installed spark-nlp-5.5.1\n" + ] + } + ], + "source": [ + "!pip install spark_nlp-5.5.1-py2.py3-none-any.whl" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DczWop6QeE8F", + "outputId": "610a531b-ad06-48c5-e868-7906ee6bef1d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apache Spark version: 3.5.3\n" + ] + } + ], + "source": [ + "# import sparknlp\n", + "# # let's start Spark with Spark NLP\n", + "# spark = sparknlp.start()\n", + "\n", + "from pyspark.sql import SparkSession\n", + "\n", + "spark = 
SparkSession.builder \\\n", + " .appName(\"SparkNLP\") \\\n", + " .master(\"local[*]\") \\\n", + " .config(\"spark.driver.memory\", \"12G\") \\\n", + " .config(\"spark.serializer\", \"org.apache.spark.serializer.KryoSerializer\") \\\n", + " .config(\"spark.kryoserializer.buffer.max\", \"2000M\") \\\n", + " .config(\"spark.driver.maxResultSize\", \"0\") \\\n", + " .config(\"spark.jars\", \"./sparknlp.jar\") \\\n", + " .getOrCreate()\n", + "\n", + "\n", + "print(\"Apache Spark version: {}\".format(spark.version))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RFOFhaEedalB" + }, + "source": [ + "## Setup and Initialization\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "Support for reading Excel files was introduced in Spark NLP 5.5.2. Please make sure you have upgraded to the latest Spark NLP release.\n", + "\n", + "For the local-file example, we will download an Excel file from the Spark NLP GitHub repo:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ya8qZe00dalC", + "outputId": "f74142d4-2686-44b3-9428-7aafb2daf2e5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mkdir: cannot create directory ‘excel-files’: File exists\n", + "--2024-12-19 18:05:41-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1102-Adding-support-to-read-Excel-files/src/test/resources/reader/xls/vodafone.xlsx\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 12541 (12K) [application/octet-stream]\n", + "Saving to: ‘excel-files/vodafone.xlsx’\n", + "\n", + "vodafone.xlsx 100%[===================>] 12.25K --.-KB/s in 0s \n", + "\n", + "2024-12-19 18:05:41 (70.5 MB/s) - ‘excel-files/vodafone.xlsx’ saved [12541/12541]\n", + "\n" + ] + } + ], + "source": [ + "!mkdir -p excel-files\n", + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1102-Adding-support-to-read-Excel-files/src/test/resources/reader/xls/vodafone.xlsx -P excel-files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EoFI66NAdalE" + }, + "source": [ + "## Parsing Excel sheets from Local Files\n", + "Use the `xls()` method to parse Excel content from local directories." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "bAkMjJ1vdalE", + "outputId": "30b31d1d-9d53-4298-abe8-3d87dac4b569" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + "+--------------------+--------------------+--------------------+\n", + "| path| content| xls|\n", + "+--------------------+--------------------+--------------------+\n", + "|file:/content/exc...|[50 4B 03 04 14 0...|[{Title, Financia...|\n", + "+--------------------+--------------------+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "xls_df = sparknlp.read().xls(\"./excel-files\")\n", + "\n", + "xls_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VWbUgoVQrO8m", + "outputId": "a48ba911-0058-495d-8c1e-758b9a116d4f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- path: string (nullable = true)\n", + " |-- content: binary (nullable = true)\n", + " |-- xls: array (nullable 
= true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- elementType: string (nullable = true)\n", + " | | |-- content: string (nullable = true)\n", + " | | |-- metadata: map (nullable = true)\n", + " | | | |-- key: string\n", + " | | | |-- value: string (valueContainsNull = true)\n", + "\n" + ] + } + ], + "source": [ + "xls_df.printSchema()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VQD2k4E5dalF" + }, + "source": [ + "## Configuration Parameters\n", + "- Font Size: You can customize the font size used to identify paragraphs that should be treated as titles. By default, the font size is set to 9.\n", + "- Cell Separator: You can also customize the separator for each cell in the sheet. By default, the separator is a tab (`\"\\t\"`).\n", + "\n", + "However, if your Excel files require a different configuration, you can adjust these parameters accordingly. The example below demonstrates how to modify and work with these settings:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MMTGmxLQdalG", + "outputId": "21ebe4e8-ac54-4bfc-8489-34ba447f39d6" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning::Spark Session already created, some configs may not take.\n", + 
"+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|xls |\n", + 
"+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {Title, ;Financial performance;;;;;;;;;, {SheetName -> Index}}, {Title, ;Topic;Period;;;Page;;;;;, {SheetName -> Index}}, {NarrativeText, ;Quarterly revenue;Nine quarters to 30 June 2023;;;1.0;;;;;, {SheetName -> Index}}, {NarrativeText, ;Group financial performance;FY 22;FY 23;;2.0;;;;;, {SheetName -> Index}}, {NarrativeText, ;Segmental results;FY 22;FY 23;;3.0;;;;;, {SheetName -> Index}}, {NarrativeText, ;Segmental analysis;FY 22;FY 23;;4.0;;;;;, {SheetName -> Index}}, {NarrativeText, ;Cash flow;FY 22;FY 23;;5.0;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {Title, ;Operational metrics;;;;;;;;;, {SheetName -> Index}}, {Title, ;Topic;Period;;;Page;;;;;, {SheetName -> Index}}, {NarrativeText, ;Mobile customers;Nine quarters to 30 June 
2023;;;6.0;;;;;, {SheetName -> Index}}, {NarrativeText, ;Fixed broadband customers;Nine quarters to 30 June 2023;;;7.0;;;;;, {SheetName -> Index}}, {NarrativeText, ;Marketable homes passed;Nine quarters to 30 June 2023;;;8.0;;;;;, {SheetName -> Index}}, {NarrativeText, ;TV customers;Nine quarters to 30 June 2023;;;9.0;;;;;, {SheetName -> Index}}, {NarrativeText, ;Converged customers;Nine quarters to 30 June 2023;;;10.0;;;;;, {SheetName -> Index}}, {NarrativeText, ;Mobile churn;Nine quarters to 30 June 2023;;;11.0;;;;;, {SheetName -> Index}}, {NarrativeText, ;Mobile data usage;Nine quarters to 30 June 2023;;;12.0;;;;;, {SheetName -> Index}}, {NarrativeText, ;Mobile ARPU;Nine quarters to 30 June 2023;;;13.0;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {Title, ;Other;;;;;;;;;, {SheetName -> Index}}, {Title, ;Topic;Period;;;Page;;;;;, {SheetName -> Index}}, {NarrativeText, ;Average foreign exchange rates;Nine quarters to 30 June 2023;;;14.0;;;;;, {SheetName -> Index}}, {NarrativeText, ;Guidance rates;FY 23/24;;;14.0;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName 
-> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}, {NarrativeText, ;;;;;;;;;;, {SheetName -> Index}}]|\n", + "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "params = {\"titleFontSize\": \"9\", \"cellSeparator\": \";\"}\n", + "xls_df = sparknlp.read(params).xls(\"./excel-files\")\n", + "xls_df.select(\"xls\").show(truncate=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "oBj0cHPXSD1m", + "outputId": "1b4543a6-e2ed-4e24-8e4e-56401da8e4df" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- path: string (nullable = true)\n", + " |-- content: binary (nullable = true)\n", + " |-- xls: array (nullable = true)\n", + " | |-- element: 
struct (containsNull = true)\n", + " | | |-- elementType: string (nullable = true)\n", + " | | |-- content: string (nullable = true)\n", + " | | |-- metadata: map (nullable = true)\n", + " | | | |-- key: string\n", + " | | | |-- value: string (valueContainsNull = true)\n", + "\n" + ] + } + ], + "source": [ + "xls_df.printSchema()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BB2FEfegGuxl" + }, + "source": [ + "You can also use DFS file systems like:\n", + "- Databricks: `dbfs://`\n", + "- HDFS: `hdfs://`\n", + "- Microsoft Fabric OneLake: `abfss://`" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/src/test/scala/com/johnsnowlabs/reader/ExcelReaderTest.scala b/src/test/scala/com/johnsnowlabs/reader/ExcelReaderTest.scala index adc15b9732543d..a9e7498ee6a018 100644 --- a/src/test/scala/com/johnsnowlabs/reader/ExcelReaderTest.scala +++ b/src/test/scala/com/johnsnowlabs/reader/ExcelReaderTest.scala @@ -25,7 +25,7 @@ class ExcelReaderTest extends AnyFlatSpec { } "ExcelReader" should "read a directory of excel files with custom cell separator" taggedAs FastTest in { - val excelReader = new ExcelReader(cellSeparator = "\t") + val excelReader = new ExcelReader(cellSeparator = ";") val excelDf = excelReader.xls(s"$docDirectory/vodafone.xlsx") excelDf.select("xls").show(false)