From b4b4ab4546b66c1519d8c4750ba110e9f1e5c2f6 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Mon, 19 Jun 2023 08:50:22 +0200 Subject: [PATCH 01/13] Bump to 5.0.0-rc1 --- README.md | 88 +++++++++---------- build.sbt | 2 +- docs/en/concepts.md | 2 +- docs/en/examples.md | 4 +- docs/en/hardware_acceleration.md | 2 +- docs/en/install.md | 54 ++++++------ docs/en/spark_nlp.md | 2 +- examples/docker/README.md | 4 +- python/README.md | 88 +++++++++---------- python/docs/conf.py | 2 +- python/setup.py | 2 +- python/sparknlp/__init__.py | 4 +- scripts/colab_setup.sh | 2 +- scripts/kaggle_setup.sh | 2 +- scripts/sagemaker_setup.sh | 2 +- .../scala/com/johnsnowlabs/nlp/SparkNLP.scala | 2 +- .../scala/com/johnsnowlabs/util/Build.scala | 2 +- 17 files changed, 132 insertions(+), 132 deletions(-) diff --git a/README.md b/README.md index 1e7009ef6ab9c6..f527abbf7dac06 100644 --- a/README.md +++ b/README.md @@ -165,7 +165,7 @@ To use Spark NLP you need the following requirements: **GPU (optional):** -Spark NLP 4.4.4 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: +Spark NLP 5.0.0-rc1 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 @@ -181,7 +181,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==4.4.4 pyspark==3.3.1 +$ pip install spark-nlp==5.0.0-rc1 pyspark==3.3.1 ``` In Python console or Jupyter `Python3` kernel: @@ -226,7 +226,7 @@ For more examples, you can visit our dedicated [examples](https://github.com/Joh ## Apache Spark Support -Spark NLP *4.4.4* has been built on top of Apache Spark 3.2 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x +Spark NLP *5.0.0-rc1* has been built on top of Apache Spark 3.2 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 
and 3.4.x | Spark NLP | Apache Spark 2.3.x | Apache Spark 2.4.x | Apache Spark 3.0.x | Apache Spark 3.1.x | Apache Spark 3.2.x | Apache Spark 3.3.x | Apache Spark 3.4.x | |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| @@ -265,7 +265,7 @@ Find out more about `Spark NLP` versions from our [release notes](https://github ## Databricks Support -Spark NLP 4.4.4 has been tested and is compatible with the following runtimes: +Spark NLP 5.0.0-rc1 has been tested and is compatible with the following runtimes: **CPU:** @@ -322,7 +322,7 @@ runtimes supporting CUDA 11 are 9.x and above as listed under GPU. ## EMR Support -Spark NLP 4.4.4 has been tested and is compatible with the following EMR releases: +Spark NLP 5.0.0-rc1 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -365,11 +365,11 @@ Spark NLP supports all major releases of Apache Spark 3.0.x, Apache Spark 3.1.x, ```sh # CPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 ``` The `spark-nlp` has been published to @@ -378,11 +378,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # GPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.4 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0-rc1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.4 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0-rc1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.4 +spark-submit --packages 
com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0-rc1 ``` @@ -392,11 +392,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # AArch64 -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:4.4.4 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0-rc1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:4.4.4 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0-rc1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:4.4.4 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0-rc1 ``` @@ -406,11 +406,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # M1/M2 (Apple Silicon) -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:4.4.4 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0-rc1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:4.4.4 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0-rc1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:4.4.4 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0-rc1 ``` @@ -424,7 +424,7 @@ set in your SparkSession: spark-shell \ --driver-memory 16g \ --conf spark.kryoserializer.buffer.max=2000M \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 ``` ## Scala @@ -442,7 +442,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp_2.12 - 4.4.4 + 5.0.0-rc1 ``` @@ -453,7 +453,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 4.4.4 + 5.0.0-rc1 ``` @@ -464,7 +464,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 4.4.4 + 5.0.0-rc1 ``` @@ -475,7 +475,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 4.4.4 + 5.0.0-rc1 ``` @@ -485,28 +485,28 @@ coordinates: ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies 
+= "com.johnsnowlabs.nlp" %% "spark-nlp" % "4.4.4" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.0-rc1" ``` **spark-nlp-gpu:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "4.4.4" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.0-rc1" ``` **spark-nlp-aarch64:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "4.4.4" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.0-rc1" ``` **spark-nlp-silicon:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "4.4.4" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.0-rc1" ``` Maven @@ -528,7 +528,7 @@ If you installed pyspark through pip/conda, you can install `spark-nlp` through Pip: ```bash -pip install spark-nlp==4.4.4 +pip install spark-nlp==5.0.0-rc1 ``` Conda: @@ -557,7 +557,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1") .getOrCreate() ``` @@ -628,7 +628,7 @@ Use either one of the following options - Add the following Maven Coordinates to the interpreter's library list ```bash -com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 +com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 ``` - Add a path to pre-built jar from [here](#compiled-jars) in the interpreter's library list making sure the jar is @@ -639,7 +639,7 @@ com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 Apart from the previous step, install the python module through pip ```bash 
-pip install spark-nlp==4.4.4 +pip install spark-nlp==5.0.0-rc1 ``` Or you can install `spark-nlp` from inside Zeppelin by using Conda: @@ -667,7 +667,7 @@ launch the Jupyter from the same Python environment: $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==4.4.4 pyspark==3.3.1 jupyter +$ pip install spark-nlp==5.0.0-rc1 pyspark==3.3.1 jupyter $ jupyter notebook ``` @@ -684,7 +684,7 @@ export PYSPARK_PYTHON=python3 export PYSPARK_DRIVER_PYTHON=jupyter export PYSPARK_DRIVER_PYTHON_OPTS=notebook -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 ``` Alternatively, you can mix in using `--jars` option for pyspark + `pip install spark-nlp` @@ -711,7 +711,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Google Colab for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 4.4.4 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.0-rc1 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) @@ -734,7 +734,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Kaggle for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 4.4.4 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.0-rc1 ``` [Spark NLP quick start on Kaggle Kernel](https://www.kaggle.com/mozzie/spark-nlp-named-entity-recognition) is a live @@ -753,9 +753,9 @@ demo on Kaggle Kernel that performs named 
entity recognitions by using Spark NLP 3. In `Libraries` tab inside your cluster you need to follow these steps: - 3.1. Install New -> PyPI -> `spark-nlp==4.4.4` -> Install + 3.1. Install New -> PyPI -> `spark-nlp==5.0.0-rc1` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! @@ -806,7 +806,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1" } }] ``` @@ -815,7 +815,7 @@ A sample of AWS CLI to launch EMR cluster: ```.sh aws emr create-cluster \ ---name "Spark NLP 4.4.4" \ +--name "Spark NLP 5.0.0-rc1" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -879,7 +879,7 @@ gcloud dataproc clusters create ${CLUSTER_NAME} \ --enable-component-gateway \ --metadata 'PIP_PACKAGES=spark-nlp spark-nlp-display google-cloud-bigquery google-cloud-storage' \ --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/python/pip-install.sh \ - --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 + --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 ``` 2. 
On an existing one, you need to install spark-nlp and spark-nlp-display packages from PyPI. @@ -918,7 +918,7 @@ spark = SparkSession.builder .config("spark.kryoserializer.buffer.max", "2000m") .config("spark.jsl.settings.pretrained.cache_folder", "sample_data/pretrained") .config("spark.jsl.settings.storage.cluster_tmp_dir", "sample_data/storage") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1") .getOrCreate() ``` @@ -932,7 +932,7 @@ spark-shell \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 ``` **pyspark:** @@ -945,7 +945,7 @@ pyspark \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 ``` **Databricks:** @@ -1217,7 +1217,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars", "/tmp/spark-nlp-assembly-4.4.4.jar") + .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.0-rc1.jar") .getOrCreate() ``` @@ -1226,7 +1226,7 @@ spark = SparkSession.builder version (3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x) - If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. 
( - i.e., `hdfs:///tmp/spark-nlp-assembly-4.4.4.jar`) + i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.0-rc1.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/build.sbt b/build.sbt index 83330ac60cd9ba..a5bfda5b4ecac1 100644 --- a/build.sbt +++ b/build.sbt @@ -6,7 +6,7 @@ name := getPackageName(is_silicon, is_gpu, is_aarch64) organization := "com.johnsnowlabs.nlp" -version := "4.4.4" +version := "5.0.0-rc1" (ThisBuild / scalaVersion) := scalaVer diff --git a/docs/en/concepts.md b/docs/en/concepts.md index e3d120d2cc7937..13e63d6d934fa6 100644 --- a/docs/en/concepts.md +++ b/docs/en/concepts.md @@ -62,7 +62,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==4.4.4 pyspark==3.3.1 jupyter +$ pip install spark-nlp==5.0.0-rc1 pyspark==3.3.1 jupyter $ jupyter notebook ``` diff --git a/docs/en/examples.md b/docs/en/examples.md index e42a18a2ba03a1..8f5a67f8b21f43 100644 --- a/docs/en/examples.md +++ b/docs/en/examples.md @@ -16,7 +16,7 @@ $ java -version # should be Java 8 (Oracle or OpenJDK) $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp -$ pip install spark-nlp==4.4.4 pyspark==3.3.1 +$ pip install spark-nlp==5.0.0-rc1 pyspark==3.3.1 ``` ## Google Colab Notebook @@ -36,7 +36,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -p is for pyspark # -s is for spark-nlp # by default they are set to the latest -!bash colab.sh -p 3.2.3 -s 4.4.4 +!bash colab.sh -p 3.2.3 -s 5.0.0-rc1 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) is a live demo on Google Colab that performs named entity recognitions and sentiment analysis by using Spark NLP pretrained pipelines. 
diff --git a/docs/en/hardware_acceleration.md b/docs/en/hardware_acceleration.md index c0a2d66c476f04..2c3b6da8c5df1e 100644 --- a/docs/en/hardware_acceleration.md +++ b/docs/en/hardware_acceleration.md @@ -49,7 +49,7 @@ Since the new Transformer models such as BERT for Word and Sentence embeddings a | DeBERTa Large | +477%(5.8x) | | Longformer Base | +52%(1.5x) | -Spark NLP 4.4.4 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: +Spark NLP 5.0.0-rc1 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 diff --git a/docs/en/install.md b/docs/en/install.md index cdc02ce49bfe58..fe55ac9031131c 100644 --- a/docs/en/install.md +++ b/docs/en/install.md @@ -15,22 +15,22 @@ sidebar: ```bash # Install Spark NLP from PyPI -pip install spark-nlp==4.4.4 +pip install spark-nlp==5.0.0-rc1 # Install Spark NLP from Anacodna/Conda conda install -c johnsnowlabs spark-nlp # Load Spark NLP with Spark Shell -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 # Load Spark NLP with PySpark -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 # Load Spark NLP with Spark Submit -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 # Load Spark NLP as external JAR after compiling and building Spark NLP by `sbt assembly` -spark-shell --jars spark-nlp-assembly-4.4.4.jar +spark-shell --jars spark-nlp-assembly-5.0.0-rc1.jar ``` ## Python @@ -49,7 +49,7 @@ $ java -version # should be Java 8 (Oracle or OpenJDK) $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp -$ pip install spark-nlp==4.4.4 pyspark==3.3.1 +$ pip install spark-nlp==5.0.0-rc1 pyspark==3.3.1 ``` Of course you 
will need to have jupyter installed in your system: @@ -76,7 +76,7 @@ spark = SparkSession.builder \ .config("spark.driver.memory","16G")\ .config("spark.driver.maxResultSize", "0") \ .config("spark.kryoserializer.buffer.max", "2000M")\ - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4")\ + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1")\ .getOrCreate() ``` @@ -91,7 +91,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp_2.12 - 4.4.4 + 5.0.0-rc1 ``` @@ -102,7 +102,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 4.4.4 + 5.0.0-rc1 ``` @@ -113,7 +113,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 4.4.4 + 5.0.0-rc1 ``` @@ -124,7 +124,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 4.4.4 + 5.0.0-rc1 ``` @@ -134,28 +134,28 @@ spark = SparkSession.builder \ ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "4.4.4" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.0-rc1" ``` **spark-nlp-gpu:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "4.4.4" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.0-rc1" ``` **spark-nlp-silicon:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "4.4.4" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.0-rc1" ``` **spark-nlp-aarch64:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "4.4.4" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.0-rc1" ``` Maven Central: 
[https://mvnrepository.com/artifact/com.johnsnowlabs.nlp](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp) @@ -233,7 +233,7 @@ maven coordinates like these: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 4.4.4 + 5.0.0-rc1 ``` @@ -241,7 +241,7 @@ or in case of sbt: ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "4.4.4" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.0-rc1" ``` If everything went well, you can now start Spark NLP with the `m1` flag set to `true`: @@ -274,7 +274,7 @@ spark = sparknlp.start(apple_silicon=True) ## Installation for Linux Aarch64 Systems -Starting from version 4.4.4, Spark NLP supports Linux systems running on an aarch64 +Starting from version 5.0.0-rc1, Spark NLP supports Linux systems running on an aarch64 processor architecture. The necessary dependencies have been built on Ubuntu 16.04, so a recent system with an environment of at least that will be needed. @@ -318,7 +318,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -p is for pyspark # -s is for spark-nlp # by default they are set to the latest -!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 4.4.4 +!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.0-rc1 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) is a live demo on Google Colab that performs named entity recognitions and sentiment analysis by using Spark NLP pretrained pipelines. @@ -337,7 +337,7 @@ Run the following code in Kaggle Kernel and start using spark-nlp right away. 
## Databricks Support -Spark NLP 4.4.4 has been tested and is compatible with the following runtimes: +Spark NLP 5.0.0-rc1 has been tested and is compatible with the following runtimes: **CPU:** @@ -403,7 +403,7 @@ NOTE: Spark NLP 4.0.x is based on TensorFlow 2.7.x which is compatible with CUDA 3.1. Install New -> PyPI -> `spark-nlp` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! @@ -419,7 +419,7 @@ Note: You can import these notebooks by using their URLs. ## EMR Support -Spark NLP 4.4.4 has been tested and is compatible with the following EMR releases: +Spark NLP 5.0.0-rc1 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -477,7 +477,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1" } } ] @@ -487,7 +487,7 @@ A sample of AWS CLI to launch EMR cluster: ```sh aws emr create-cluster \ ---name "Spark NLP 4.4.4" \ +--name "Spark NLP 5.0.0-rc1" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -741,7 +741,7 @@ We recommend using `conda` to manage your Python environment on Windows. Now you can use the downloaded binary by navigating to `%SPARK_HOME%\bin` and running -Either create a conda env for python 3.6, install *pyspark==3.3.1 spark-nlp numpy* and use Jupyter/python console, or in the same conda env you can go to spark bin for *pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4*. 
+Either create a conda env for python 3.6, install *pyspark==3.3.1 spark-nlp numpy* and use Jupyter/python console, or in the same conda env you can go to spark bin for *pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1*. @@ -767,12 +767,12 @@ spark = SparkSession.builder \ .config("spark.driver.memory","16G")\ .config("spark.driver.maxResultSize", "0") \ .config("spark.kryoserializer.buffer.max", "2000M")\ - .config("spark.jars", "/tmp/spark-nlp-assembly-4.4.4.jar")\ + .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.0-rc1.jar")\ .getOrCreate() ``` - You can download provided Fat JARs from each [release notes](https://github.com/JohnSnowLabs/spark-nlp/releases), please pay attention to pick the one that suits your environment depending on the device (CPU/GPU) and Apache Spark version (3.x) -- If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. (i.e., `hdfs:///tmp/spark-nlp-assembly-4.4.4.jar`) +- If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. (i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.0-rc1.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/docs/en/spark_nlp.md b/docs/en/spark_nlp.md index 8911f9c3bc22af..dce0861451334b 100644 --- a/docs/en/spark_nlp.md +++ b/docs/en/spark_nlp.md @@ -25,7 +25,7 @@ Spark NLP is built on top of **Apache Spark 3.x**. 
For using Spark NLP you need: **GPU (optional):** -Spark NLP 4.4.4 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: +Spark NLP 5.0.0-rc1 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 diff --git a/examples/docker/README.md b/examples/docker/README.md index cc40ddfef07794..8b10a4d43782a6 100644 --- a/examples/docker/README.md +++ b/examples/docker/README.md @@ -73,7 +73,7 @@ docker run -it --name sparknlp-container \ --conf "spark.serializer"="org.apache.spark.serializer.KryoSerializer" \ --conf "spark.kryoserializer.buffer.max"="2000M" \ --conf "spark.driver.maxResultSize"="0" \ - --packages "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4" + --packages "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1" ``` To run the shell with GPU support, we use the image from [Jupyter Notebook with GPU @@ -91,5 +91,5 @@ docker run -it --name sparknlp-container \ --conf "spark.serializer"="org.apache.spark.serializer.KryoSerializer" \ --conf "spark.kryoserializer.buffer.max"="2000M" \ --conf "spark.driver.maxResultSize"="0" \ - --packages "com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.4" + --packages "com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0-rc1" ``` diff --git a/python/README.md b/python/README.md index 1e7009ef6ab9c6..f527abbf7dac06 100644 --- a/python/README.md +++ b/python/README.md @@ -165,7 +165,7 @@ To use Spark NLP you need the following requirements: **GPU (optional):** -Spark NLP 4.4.4 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: +Spark NLP 5.0.0-rc1 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 @@ -181,7 +181,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is 
based on pyspark 3.x -$ pip install spark-nlp==4.4.4 pyspark==3.3.1 +$ pip install spark-nlp==5.0.0-rc1 pyspark==3.3.1 ``` In Python console or Jupyter `Python3` kernel: @@ -226,7 +226,7 @@ For more examples, you can visit our dedicated [examples](https://github.com/Joh ## Apache Spark Support -Spark NLP *4.4.4* has been built on top of Apache Spark 3.2 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x +Spark NLP *5.0.0-rc1* has been built on top of Apache Spark 3.2 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x | Spark NLP | Apache Spark 2.3.x | Apache Spark 2.4.x | Apache Spark 3.0.x | Apache Spark 3.1.x | Apache Spark 3.2.x | Apache Spark 3.3.x | Apache Spark 3.4.x | |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| @@ -265,7 +265,7 @@ Find out more about `Spark NLP` versions from our [release notes](https://github ## Databricks Support -Spark NLP 4.4.4 has been tested and is compatible with the following runtimes: +Spark NLP 5.0.0-rc1 has been tested and is compatible with the following runtimes: **CPU:** @@ -322,7 +322,7 @@ runtimes supporting CUDA 11 are 9.x and above as listed under GPU. 
## EMR Support -Spark NLP 4.4.4 has been tested and is compatible with the following EMR releases: +Spark NLP 5.0.0-rc1 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -365,11 +365,11 @@ Spark NLP supports all major releases of Apache Spark 3.0.x, Apache Spark 3.1.x, ```sh # CPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 ``` The `spark-nlp` has been published to @@ -378,11 +378,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # GPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.4 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0-rc1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.4 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0-rc1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.4 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0-rc1 ``` @@ -392,11 +392,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # AArch64 -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:4.4.4 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0-rc1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:4.4.4 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0-rc1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:4.4.4 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0-rc1 ``` @@ -406,11 +406,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # 
M1/M2 (Apple Silicon) -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:4.4.4 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0-rc1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:4.4.4 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0-rc1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:4.4.4 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0-rc1 ``` @@ -424,7 +424,7 @@ set in your SparkSession: spark-shell \ --driver-memory 16g \ --conf spark.kryoserializer.buffer.max=2000M \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 ``` ## Scala @@ -442,7 +442,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp_2.12 - 4.4.4 + 5.0.0-rc1 ``` @@ -453,7 +453,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 4.4.4 + 5.0.0-rc1 ``` @@ -464,7 +464,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 4.4.4 + 5.0.0-rc1 ``` @@ -475,7 +475,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 4.4.4 + 5.0.0-rc1 ``` @@ -485,28 +485,28 @@ coordinates: ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "4.4.4" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.0-rc1" ``` **spark-nlp-gpu:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "4.4.4" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.0-rc1" ``` **spark-nlp-aarch64:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "4.4.4" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.0-rc1" ``` **spark-nlp-silicon:** ```sbtshell // 
https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "4.4.4" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.0-rc1" ``` Maven @@ -528,7 +528,7 @@ If you installed pyspark through pip/conda, you can install `spark-nlp` through Pip: ```bash -pip install spark-nlp==4.4.4 +pip install spark-nlp==5.0.0-rc1 ``` Conda: @@ -557,7 +557,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1") .getOrCreate() ``` @@ -628,7 +628,7 @@ Use either one of the following options - Add the following Maven Coordinates to the interpreter's library list ```bash -com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 +com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 ``` - Add a path to pre-built jar from [here](#compiled-jars) in the interpreter's library list making sure the jar is @@ -639,7 +639,7 @@ com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 Apart from the previous step, install the python module through pip ```bash -pip install spark-nlp==4.4.4 +pip install spark-nlp==5.0.0-rc1 ``` Or you can install `spark-nlp` from inside Zeppelin by using Conda: @@ -667,7 +667,7 @@ launch the Jupyter from the same Python environment: $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==4.4.4 pyspark==3.3.1 jupyter +$ pip install spark-nlp==5.0.0-rc1 pyspark==3.3.1 jupyter $ jupyter notebook ``` @@ -684,7 +684,7 @@ export PYSPARK_PYTHON=python3 export PYSPARK_DRIVER_PYTHON=jupyter export PYSPARK_DRIVER_PYTHON_OPTS=notebook -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 +pyspark --packages 
com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 ``` Alternatively, you can mix in using `--jars` option for pyspark + `pip install spark-nlp` @@ -711,7 +711,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Google Colab for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 4.4.4 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.0-rc1 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) @@ -734,7 +734,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Kaggle for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 4.4.4 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.0-rc1 ``` [Spark NLP quick start on Kaggle Kernel](https://www.kaggle.com/mozzie/spark-nlp-named-entity-recognition) is a live @@ -753,9 +753,9 @@ demo on Kaggle Kernel that performs named entity recognitions by using Spark NLP 3. In `Libraries` tab inside your cluster you need to follow these steps: - 3.1. Install New -> PyPI -> `spark-nlp==4.4.4` -> Install + 3.1. Install New -> PyPI -> `spark-nlp==5.0.0-rc1` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! 
@@ -806,7 +806,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1" } }] ``` @@ -815,7 +815,7 @@ A sample of AWS CLI to launch EMR cluster: ```.sh aws emr create-cluster \ ---name "Spark NLP 4.4.4" \ +--name "Spark NLP 5.0.0-rc1" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -879,7 +879,7 @@ gcloud dataproc clusters create ${CLUSTER_NAME} \ --enable-component-gateway \ --metadata 'PIP_PACKAGES=spark-nlp spark-nlp-display google-cloud-bigquery google-cloud-storage' \ --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/python/pip-install.sh \ - --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 + --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 ``` 2. On an existing one, you need to install spark-nlp and spark-nlp-display packages from PyPI. 
@@ -918,7 +918,7 @@ spark = SparkSession.builder .config("spark.kryoserializer.buffer.max", "2000m") .config("spark.jsl.settings.pretrained.cache_folder", "sample_data/pretrained") .config("spark.jsl.settings.storage.cluster_tmp_dir", "sample_data/storage") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1") .getOrCreate() ``` @@ -932,7 +932,7 @@ spark-shell \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 ``` **pyspark:** @@ -945,7 +945,7 @@ pyspark \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.4 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 ``` **Databricks:** @@ -1217,7 +1217,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars", "/tmp/spark-nlp-assembly-4.4.4.jar") + .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.0-rc1.jar") .getOrCreate() ``` @@ -1226,7 +1226,7 @@ spark = SparkSession.builder version (3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x) - If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. 
( - i.e., `hdfs:///tmp/spark-nlp-assembly-4.4.4.jar`) + i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.0-rc1.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/python/docs/conf.py b/python/docs/conf.py index 0ac5b6ac0781d1..1a63bec2bfdf1b 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -23,7 +23,7 @@ author = "John Snow Labs" # The full version, including alpha/beta/rc tags -release = "4.4.4" +release = "5.0.0-rc1" pyspark_version = "3.2.3" # -- General configuration --------------------------------------------------- diff --git a/python/setup.py b/python/setup.py index 8291ce89f2f2c3..ebd7988deb5e17 100644 --- a/python/setup.py +++ b/python/setup.py @@ -41,7 +41,7 @@ # project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='4.4.4', # Required + version='5.0.0-rc1', # Required # This is a one-line description or tagline of what your project does. This # corresponds to the 'Summary' metadata field: diff --git a/python/sparknlp/__init__.py b/python/sparknlp/__init__.py index 4c398e87bc221a..5bc12684dfbcae 100644 --- a/python/sparknlp/__init__.py +++ b/python/sparknlp/__init__.py @@ -128,7 +128,7 @@ def start(gpu=False, The initiated Spark session. """ - current_version = "4.4.4" + current_version = "5.0.0-rc1" if params is None: params = {} @@ -298,4 +298,4 @@ def version(): str The current Spark NLP version. 
""" - return '4.4.4' + return '5.0.0-rc1' diff --git a/scripts/colab_setup.sh b/scripts/colab_setup.sh index 19a4d335643617..2757eb27031e1d 100644 --- a/scripts/colab_setup.sh +++ b/scripts/colab_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash #default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="4.4.4" +SPARKNLP="5.0.0-rc1" PYSPARK="3.2.3" while getopts s:p:g option diff --git a/scripts/kaggle_setup.sh b/scripts/kaggle_setup.sh index 1f2138dcadc232..55825765028b25 100644 --- a/scripts/kaggle_setup.sh +++ b/scripts/kaggle_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash #default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="4.4.4" +SPARKNLP="5.0.0-rc1" PYSPARK="3.2.3" while getopts s:p:g option diff --git a/scripts/sagemaker_setup.sh b/scripts/sagemaker_setup.sh index 78bdd2e972f5f6..7f074a47e0b36f 100644 --- a/scripts/sagemaker_setup.sh +++ b/scripts/sagemaker_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash # Default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="4.4.4" +SPARKNLP="5.0.0-rc1" PYSPARK="3.2.3" echo "Setup SageMaker for PySpark $PYSPARK and Spark NLP $SPARKNLP" diff --git a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala index ae495ee3dcf908..1eb5ab932ffbf9 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala @@ -20,7 +20,7 @@ import org.apache.spark.sql.SparkSession object SparkNLP { - val currentVersion = "4.4.4" + val currentVersion = "5.0.0-rc1" val MavenSpark3 = s"com.johnsnowlabs.nlp:spark-nlp_2.12:$currentVersion" val MavenGpuSpark3 = s"com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:$currentVersion" val MavenSparkSilicon = s"com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:$currentVersion" diff --git a/src/main/scala/com/johnsnowlabs/util/Build.scala b/src/main/scala/com/johnsnowlabs/util/Build.scala index e8f1667bcda04f..1393865a04f02c 100644 --- a/src/main/scala/com/johnsnowlabs/util/Build.scala +++ 
b/src/main/scala/com/johnsnowlabs/util/Build.scala @@ -17,5 +17,5 @@ package com.johnsnowlabs.util object Build { - val version: String = "4.4.4" + val version: String = "5.0.0-rc1" } From 5ade2945ad4c8c7a48dfdda3e6f2c9654e0ccb5b Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Fri, 30 Jun 2023 20:39:28 +0200 Subject: [PATCH 02/13] Update version to 5.0.0 [skip test] --- README.md | 90 +++++++++---------- build.sbt | 2 +- docs/_layouts/landing.html | 2 +- docs/en/concepts.md | 2 +- docs/en/examples.md | 4 +- docs/en/hardware_acceleration.md | 2 +- docs/en/install.md | 54 +++++------ docs/en/spark_nlp.md | 2 +- examples/docker/README.md | 4 +- python/README.md | 90 +++++++++---------- python/docs/conf.py | 2 +- python/setup.py | 2 +- python/sparknlp/__init__.py | 4 +- scripts/colab_setup.sh | 2 +- scripts/kaggle_setup.sh | 2 +- scripts/sagemaker_setup.sh | 2 +- .../scala/com/johnsnowlabs/nlp/SparkNLP.scala | 2 +- .../scala/com/johnsnowlabs/util/Build.scala | 2 +- 18 files changed, 135 insertions(+), 135 deletions(-) diff --git a/README.md b/README.md index f527abbf7dac06..2e6dd3d53d5a51 100644 --- a/README.md +++ b/README.md @@ -148,7 +148,7 @@ documentation and examples - Automatic Speech Recognition (Wav2Vec2) - Automatic Speech Recognition (HuBERT) - Named entity recognition (Deep learning) -- Easy TensorFlow integration +- Easy ONNX and TensorFlow integrations - GPU Support - Full integration with Spark ML functions - +12000 pre-trained models in +200 languages! @@ -165,7 +165,7 @@ To use Spark NLP you need the following requirements: **GPU (optional):** -Spark NLP 5.0.0-rc1 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: +Spark NLP 5.0.0 is built with ONNX 1.15.1 and TensorFlow 2.7.1 deep learning engines. 
The minimum following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 @@ -181,7 +181,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.0.0-rc1 pyspark==3.3.1 +$ pip install spark-nlp==5.0.0 pyspark==3.3.1 ``` In Python console or Jupyter `Python3` kernel: @@ -226,7 +226,7 @@ For more examples, you can visit our dedicated [examples](https://github.com/Joh ## Apache Spark Support -Spark NLP *5.0.0-rc1* has been built on top of Apache Spark 3.2 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x +Spark NLP *5.0.0* has been built on top of Apache Spark 3.2 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x | Spark NLP | Apache Spark 2.3.x | Apache Spark 2.4.x | Apache Spark 3.0.x | Apache Spark 3.1.x | Apache Spark 3.2.x | Apache Spark 3.3.x | Apache Spark 3.4.x | |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| @@ -265,7 +265,7 @@ Find out more about `Spark NLP` versions from our [release notes](https://github ## Databricks Support -Spark NLP 5.0.0-rc1 has been tested and is compatible with the following runtimes: +Spark NLP 5.0.0 has been tested and is compatible with the following runtimes: **CPU:** @@ -322,7 +322,7 @@ runtimes supporting CUDA 11 are 9.x and above as listed under GPU. 
## EMR Support -Spark NLP 5.0.0-rc1 has been tested and is compatible with the following EMR releases: +Spark NLP 5.0.0 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -365,11 +365,11 @@ Spark NLP supports all major releases of Apache Spark 3.0.x, Apache Spark 3.1.x, ```sh # CPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 ``` The `spark-nlp` has been published to @@ -378,11 +378,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # GPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0-rc1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0-rc1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0-rc1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0 ``` @@ -392,11 +392,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # AArch64 -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0-rc1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0-rc1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0-rc1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0 ``` @@ -406,11 +406,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # 
M1/M2 (Apple Silicon) -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0-rc1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0-rc1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0-rc1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0 ``` @@ -424,7 +424,7 @@ set in your SparkSession: spark-shell \ --driver-memory 16g \ --conf spark.kryoserializer.buffer.max=2000M \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 ``` ## Scala @@ -442,7 +442,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp_2.12 - 5.0.0-rc1 + 5.0.0 ``` @@ -453,7 +453,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 5.0.0-rc1 + 5.0.0 ``` @@ -464,7 +464,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 5.0.0-rc1 + 5.0.0 ``` @@ -475,7 +475,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.0.0-rc1 + 5.0.0 ``` @@ -485,28 +485,28 @@ coordinates: ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.0-rc1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.0" ``` **spark-nlp-gpu:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.0-rc1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.0" ``` **spark-nlp-aarch64:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.0-rc1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.0" ``` **spark-nlp-silicon:** ```sbtshell // 
https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.0-rc1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.0" ``` Maven @@ -528,7 +528,7 @@ If you installed pyspark through pip/conda, you can install `spark-nlp` through Pip: ```bash -pip install spark-nlp==5.0.0-rc1 +pip install spark-nlp==5.0.0 ``` Conda: @@ -557,7 +557,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0") .getOrCreate() ``` @@ -628,7 +628,7 @@ Use either one of the following options - Add the following Maven Coordinates to the interpreter's library list ```bash -com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 +com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 ``` - Add a path to pre-built jar from [here](#compiled-jars) in the interpreter's library list making sure the jar is @@ -639,7 +639,7 @@ com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 Apart from the previous step, install the python module through pip ```bash -pip install spark-nlp==5.0.0-rc1 +pip install spark-nlp==5.0.0 ``` Or you can install `spark-nlp` from inside Zeppelin by using Conda: @@ -667,7 +667,7 @@ launch the Jupyter from the same Python environment: $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.0.0-rc1 pyspark==3.3.1 jupyter +$ pip install spark-nlp==5.0.0 pyspark==3.3.1 jupyter $ jupyter notebook ``` @@ -684,7 +684,7 @@ export PYSPARK_PYTHON=python3 export PYSPARK_DRIVER_PYTHON=jupyter export PYSPARK_DRIVER_PYTHON_OPTS=notebook -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 +pyspark --packages 
com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 ``` Alternatively, you can mix in using `--jars` option for pyspark + `pip install spark-nlp` @@ -711,7 +711,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Google Colab for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.0-rc1 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.0 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) @@ -734,7 +734,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Kaggle for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.0-rc1 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.0 ``` [Spark NLP quick start on Kaggle Kernel](https://www.kaggle.com/mozzie/spark-nlp-named-entity-recognition) is a live @@ -753,9 +753,9 @@ demo on Kaggle Kernel that performs named entity recognitions by using Spark NLP 3. In `Libraries` tab inside your cluster you need to follow these steps: - 3.1. Install New -> PyPI -> `spark-nlp==5.0.0-rc1` -> Install + 3.1. Install New -> PyPI -> `spark-nlp==5.0.0` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! 
@@ -806,7 +806,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0" } }] ``` @@ -815,7 +815,7 @@ A sample of AWS CLI to launch EMR cluster: ```.sh aws emr create-cluster \ ---name "Spark NLP 5.0.0-rc1" \ +--name "Spark NLP 5.0.0" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -879,7 +879,7 @@ gcloud dataproc clusters create ${CLUSTER_NAME} \ --enable-component-gateway \ --metadata 'PIP_PACKAGES=spark-nlp spark-nlp-display google-cloud-bigquery google-cloud-storage' \ --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/python/pip-install.sh \ - --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 + --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 ``` 2. On an existing one, you need to install spark-nlp and spark-nlp-display packages from PyPI. 
@@ -918,7 +918,7 @@ spark = SparkSession.builder .config("spark.kryoserializer.buffer.max", "2000m") .config("spark.jsl.settings.pretrained.cache_folder", "sample_data/pretrained") .config("spark.jsl.settings.storage.cluster_tmp_dir", "sample_data/storage") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0") .getOrCreate() ``` @@ -932,7 +932,7 @@ spark-shell \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 ``` **pyspark:** @@ -945,7 +945,7 @@ pyspark \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 ``` **Databricks:** @@ -1217,7 +1217,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.0-rc1.jar") + .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.0.jar") .getOrCreate() ``` @@ -1226,7 +1226,7 @@ spark = SparkSession.builder version (3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x) - If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. 
( - i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.0-rc1.jar`) + i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.0.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/build.sbt b/build.sbt index a5bfda5b4ecac1..7821eecfcc413b 100644 --- a/build.sbt +++ b/build.sbt @@ -6,7 +6,7 @@ name := getPackageName(is_silicon, is_gpu, is_aarch64) organization := "com.johnsnowlabs.nlp" -version := "5.0.0-rc1" +version := "5.0.0" (ThisBuild / scalaVersion) := scalaVer diff --git a/docs/_layouts/landing.html b/docs/_layouts/landing.html index 38e17fbe09d7d1..b1724ad7c1e212 100755 --- a/docs/_layouts/landing.html +++ b/docs/_layouts/landing.html @@ -332,7 +332,7 @@

NLP Features

  • Microsoft Swin Transformer Image Classification
  • Facebook ConvNext Image Classification
  • Automatic Speech Recognition (Wav2Vec2 & HuBERT)
  • -
  • Easy TensorFlow integration
  • +
  • Easy ONNX and TensorFlow integrations
  • GPU Support
  • Full integration with Spark ML functions
  • 12000+ pre-trained models in 200+ languages! diff --git a/docs/en/concepts.md b/docs/en/concepts.md index 13e63d6d934fa6..fa58cbf7181663 100644 --- a/docs/en/concepts.md +++ b/docs/en/concepts.md @@ -62,7 +62,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.0.0-rc1 pyspark==3.3.1 jupyter +$ pip install spark-nlp==5.0.0 pyspark==3.3.1 jupyter $ jupyter notebook ``` diff --git a/docs/en/examples.md b/docs/en/examples.md index 8f5a67f8b21f43..4a321bb4fb0bdb 100644 --- a/docs/en/examples.md +++ b/docs/en/examples.md @@ -16,7 +16,7 @@ $ java -version # should be Java 8 (Oracle or OpenJDK) $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp -$ pip install spark-nlp==5.0.0-rc1 pyspark==3.3.1 +$ pip install spark-nlp==5.0.0 pyspark==3.3.1 ``` ## Google Colab Notebook @@ -36,7 +36,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -p is for pyspark # -s is for spark-nlp # by default they are set to the latest -!bash colab.sh -p 3.2.3 -s 5.0.0-rc1 +!bash colab.sh -p 3.2.3 -s 5.0.0 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) is a live demo on Google Colab that performs named entity recognitions and sentiment analysis by using Spark NLP pretrained pipelines. 
diff --git a/docs/en/hardware_acceleration.md b/docs/en/hardware_acceleration.md index 2c3b6da8c5df1e..82b6ffc3040a81 100644 --- a/docs/en/hardware_acceleration.md +++ b/docs/en/hardware_acceleration.md @@ -49,7 +49,7 @@ Since the new Transformer models such as BERT for Word and Sentence embeddings a | DeBERTa Large | +477%(5.8x) | | Longformer Base | +52%(1.5x) | -Spark NLP 5.0.0-rc1 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: +Spark NLP 5.0.0 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 diff --git a/docs/en/install.md b/docs/en/install.md index fe55ac9031131c..0190c1d2047272 100644 --- a/docs/en/install.md +++ b/docs/en/install.md @@ -15,22 +15,22 @@ sidebar: ```bash # Install Spark NLP from PyPI -pip install spark-nlp==5.0.0-rc1 +pip install spark-nlp==5.0.0 # Install Spark NLP from Anacodna/Conda conda install -c johnsnowlabs spark-nlp # Load Spark NLP with Spark Shell -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 # Load Spark NLP with PySpark -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 # Load Spark NLP with Spark Submit -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 # Load Spark NLP as external JAR after compiling and building Spark NLP by `sbt assembly` -spark-shell --jars spark-nlp-assembly-5.0.0-rc1.jar +spark-shell --jars spark-nlp-assembly-5.0.0.jar ``` ## Python @@ -49,7 +49,7 @@ $ java -version # should be Java 8 (Oracle or OpenJDK) $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp -$ pip install spark-nlp==5.0.0-rc1 pyspark==3.3.1 +$ pip install spark-nlp==5.0.0 pyspark==3.3.1 ``` Of course you 
will need to have jupyter installed in your system: @@ -76,7 +76,7 @@ spark = SparkSession.builder \ .config("spark.driver.memory","16G")\ .config("spark.driver.maxResultSize", "0") \ .config("spark.kryoserializer.buffer.max", "2000M")\ - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1")\ + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0")\ .getOrCreate() ``` @@ -91,7 +91,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp_2.12 - 5.0.0-rc1 + 5.0.0 ``` @@ -102,7 +102,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 5.0.0-rc1 + 5.0.0 ``` @@ -113,7 +113,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.0.0-rc1 + 5.0.0 ``` @@ -124,7 +124,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 5.0.0-rc1 + 5.0.0 ``` @@ -134,28 +134,28 @@ spark = SparkSession.builder \ ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.0-rc1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.0" ``` **spark-nlp-gpu:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.0-rc1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.0" ``` **spark-nlp-silicon:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.0-rc1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.0" ``` **spark-nlp-aarch64:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.0-rc1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.0" ``` Maven Central: 
[https://mvnrepository.com/artifact/com.johnsnowlabs.nlp](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp) @@ -233,7 +233,7 @@ maven coordinates like these: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.0.0-rc1 + 5.0.0 ``` @@ -241,7 +241,7 @@ or in case of sbt: ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.0-rc1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.0" ``` If everything went well, you can now start Spark NLP with the `m1` flag set to `true`: @@ -274,7 +274,7 @@ spark = sparknlp.start(apple_silicon=True) ## Installation for Linux Aarch64 Systems -Starting from version 5.0.0-rc1, Spark NLP supports Linux systems running on an aarch64 +Starting from version 5.0.0, Spark NLP supports Linux systems running on an aarch64 processor architecture. The necessary dependencies have been built on Ubuntu 16.04, so a recent system with an environment of at least that will be needed. @@ -318,7 +318,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -p is for pyspark # -s is for spark-nlp # by default they are set to the latest -!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.0-rc1 +!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.0 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) is a live demo on Google Colab that performs named entity recognitions and sentiment analysis by using Spark NLP pretrained pipelines. @@ -337,7 +337,7 @@ Run the following code in Kaggle Kernel and start using spark-nlp right away. 
## Databricks Support -Spark NLP 5.0.0-rc1 has been tested and is compatible with the following runtimes: +Spark NLP 5.0.0 has been tested and is compatible with the following runtimes: **CPU:** @@ -403,7 +403,7 @@ NOTE: Spark NLP 4.0.x is based on TensorFlow 2.7.x which is compatible with CUDA 3.1. Install New -> PyPI -> `spark-nlp` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! @@ -419,7 +419,7 @@ Note: You can import these notebooks by using their URLs. ## EMR Support -Spark NLP 5.0.0-rc1 has been tested and is compatible with the following EMR releases: +Spark NLP 5.0.0 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -477,7 +477,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0" } } ] @@ -487,7 +487,7 @@ A sample of AWS CLI to launch EMR cluster: ```sh aws emr create-cluster \ ---name "Spark NLP 5.0.0-rc1" \ +--name "Spark NLP 5.0.0" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -741,7 +741,7 @@ We recommend using `conda` to manage your Python environment on Windows. Now you can use the downloaded binary by navigating to `%SPARK_HOME%\bin` and running -Either create a conda env for python 3.6, install *pyspark==3.3.1 spark-nlp numpy* and use Jupyter/python console, or in the same conda env you can go to spark bin for *pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1*. 
+Either create a conda env for python 3.6, install *pyspark==3.3.1 spark-nlp numpy* and use Jupyter/python console, or in the same conda env you can go to spark bin for *pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0*. @@ -767,12 +767,12 @@ spark = SparkSession.builder \ .config("spark.driver.memory","16G")\ .config("spark.driver.maxResultSize", "0") \ .config("spark.kryoserializer.buffer.max", "2000M")\ - .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.0-rc1.jar")\ + .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.0.jar")\ .getOrCreate() ``` - You can download provided Fat JARs from each [release notes](https://github.com/JohnSnowLabs/spark-nlp/releases), please pay attention to pick the one that suits your environment depending on the device (CPU/GPU) and Apache Spark version (3.x) -- If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. (i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.0-rc1.jar`) +- If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. (i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.0.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/docs/en/spark_nlp.md b/docs/en/spark_nlp.md index dce0861451334b..5d0a07bc7bd110 100644 --- a/docs/en/spark_nlp.md +++ b/docs/en/spark_nlp.md @@ -25,7 +25,7 @@ Spark NLP is built on top of **Apache Spark 3.x**. 
For using Spark NLP you need: **GPU (optional):** -Spark NLP 5.0.0-rc1 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: +Spark NLP 5.0.0 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 diff --git a/examples/docker/README.md b/examples/docker/README.md index 8b10a4d43782a6..bdf00be0a55508 100644 --- a/examples/docker/README.md +++ b/examples/docker/README.md @@ -73,7 +73,7 @@ docker run -it --name sparknlp-container \ --conf "spark.serializer"="org.apache.spark.serializer.KryoSerializer" \ --conf "spark.kryoserializer.buffer.max"="2000M" \ --conf "spark.driver.maxResultSize"="0" \ - --packages "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1" + --packages "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0" ``` To run the shell with GPU support, we use the image from [Jupyter Notebook with GPU @@ -91,5 +91,5 @@ docker run -it --name sparknlp-container \ --conf "spark.serializer"="org.apache.spark.serializer.KryoSerializer" \ --conf "spark.kryoserializer.buffer.max"="2000M" \ --conf "spark.driver.maxResultSize"="0" \ - --packages "com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0-rc1" + --packages "com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0" ``` diff --git a/python/README.md b/python/README.md index f527abbf7dac06..2e6dd3d53d5a51 100644 --- a/python/README.md +++ b/python/README.md @@ -148,7 +148,7 @@ documentation and examples - Automatic Speech Recognition (Wav2Vec2) - Automatic Speech Recognition (HuBERT) - Named entity recognition (Deep learning) -- Easy TensorFlow integration +- Easy ONNX and TensorFlow integrations - GPU Support - Full integration with Spark ML functions - +12000 pre-trained models in +200 languages! 
@@ -165,7 +165,7 @@ To use Spark NLP you need the following requirements: **GPU (optional):** -Spark NLP 5.0.0-rc1 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: +Spark NLP 5.0.0 is built with ONNX 1.15.1 and TensorFlow 2.7.1 deep learning engines. The minimum following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 @@ -181,7 +181,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.0.0-rc1 pyspark==3.3.1 +$ pip install spark-nlp==5.0.0 pyspark==3.3.1 ``` In Python console or Jupyter `Python3` kernel: @@ -226,7 +226,7 @@ For more examples, you can visit our dedicated [examples](https://github.com/Joh ## Apache Spark Support -Spark NLP *5.0.0-rc1* has been built on top of Apache Spark 3.2 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x +Spark NLP *5.0.0* has been built on top of Apache Spark 3.2 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x | Spark NLP | Apache Spark 2.3.x | Apache Spark 2.4.x | Apache Spark 3.0.x | Apache Spark 3.1.x | Apache Spark 3.2.x | Apache Spark 3.3.x | Apache Spark 3.4.x | |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| @@ -265,7 +265,7 @@ Find out more about `Spark NLP` versions from our [release notes](https://github ## Databricks Support -Spark NLP 5.0.0-rc1 has been tested and is compatible with the following runtimes: +Spark NLP 5.0.0 has been tested and is compatible with the following runtimes: **CPU:** @@ -322,7 +322,7 @@ runtimes supporting CUDA 11 are 9.x and above as listed under GPU. 
## EMR Support -Spark NLP 5.0.0-rc1 has been tested and is compatible with the following EMR releases: +Spark NLP 5.0.0 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -365,11 +365,11 @@ Spark NLP supports all major releases of Apache Spark 3.0.x, Apache Spark 3.1.x, ```sh # CPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 ``` The `spark-nlp` has been published to @@ -378,11 +378,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # GPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0-rc1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0-rc1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0-rc1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.0 ``` @@ -392,11 +392,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # AArch64 -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0-rc1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0-rc1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0-rc1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.0 ``` @@ -406,11 +406,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # 
M1/M2 (Apple Silicon) -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0-rc1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0-rc1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0-rc1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.0 ``` @@ -424,7 +424,7 @@ set in your SparkSession: spark-shell \ --driver-memory 16g \ --conf spark.kryoserializer.buffer.max=2000M \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 ``` ## Scala @@ -442,7 +442,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp_2.12 - 5.0.0-rc1 + 5.0.0 ``` @@ -453,7 +453,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 5.0.0-rc1 + 5.0.0 ``` @@ -464,7 +464,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 5.0.0-rc1 + 5.0.0 ``` @@ -475,7 +475,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.0.0-rc1 + 5.0.0 ``` @@ -485,28 +485,28 @@ coordinates: ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.0-rc1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.0" ``` **spark-nlp-gpu:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.0-rc1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.0" ``` **spark-nlp-aarch64:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.0-rc1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.0" ``` **spark-nlp-silicon:** ```sbtshell // 
https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.0-rc1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.0" ``` Maven @@ -528,7 +528,7 @@ If you installed pyspark through pip/conda, you can install `spark-nlp` through Pip: ```bash -pip install spark-nlp==5.0.0-rc1 +pip install spark-nlp==5.0.0 ``` Conda: @@ -557,7 +557,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0") .getOrCreate() ``` @@ -628,7 +628,7 @@ Use either one of the following options - Add the following Maven Coordinates to the interpreter's library list ```bash -com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 +com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 ``` - Add a path to pre-built jar from [here](#compiled-jars) in the interpreter's library list making sure the jar is @@ -639,7 +639,7 @@ com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 Apart from the previous step, install the python module through pip ```bash -pip install spark-nlp==5.0.0-rc1 +pip install spark-nlp==5.0.0 ``` Or you can install `spark-nlp` from inside Zeppelin by using Conda: @@ -667,7 +667,7 @@ launch the Jupyter from the same Python environment: $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.0.0-rc1 pyspark==3.3.1 jupyter +$ pip install spark-nlp==5.0.0 pyspark==3.3.1 jupyter $ jupyter notebook ``` @@ -684,7 +684,7 @@ export PYSPARK_PYTHON=python3 export PYSPARK_DRIVER_PYTHON=jupyter export PYSPARK_DRIVER_PYTHON_OPTS=notebook -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 +pyspark --packages 
com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 ``` Alternatively, you can mix in using `--jars` option for pyspark + `pip install spark-nlp` @@ -711,7 +711,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Google Colab for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.0-rc1 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.0 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) @@ -734,7 +734,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Kaggle for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.0-rc1 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.0 ``` [Spark NLP quick start on Kaggle Kernel](https://www.kaggle.com/mozzie/spark-nlp-named-entity-recognition) is a live @@ -753,9 +753,9 @@ demo on Kaggle Kernel that performs named entity recognitions by using Spark NLP 3. In `Libraries` tab inside your cluster you need to follow these steps: - 3.1. Install New -> PyPI -> `spark-nlp==5.0.0-rc1` -> Install + 3.1. Install New -> PyPI -> `spark-nlp==5.0.0` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! 
@@ -806,7 +806,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0" } }] ``` @@ -815,7 +815,7 @@ A sample of AWS CLI to launch EMR cluster: ```.sh aws emr create-cluster \ ---name "Spark NLP 5.0.0-rc1" \ +--name "Spark NLP 5.0.0" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -879,7 +879,7 @@ gcloud dataproc clusters create ${CLUSTER_NAME} \ --enable-component-gateway \ --metadata 'PIP_PACKAGES=spark-nlp spark-nlp-display google-cloud-bigquery google-cloud-storage' \ --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/python/pip-install.sh \ - --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 + --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 ``` 2. On an existing one, you need to install spark-nlp and spark-nlp-display packages from PyPI. 
@@ -918,7 +918,7 @@ spark = SparkSession.builder .config("spark.kryoserializer.buffer.max", "2000m") .config("spark.jsl.settings.pretrained.cache_folder", "sample_data/pretrained") .config("spark.jsl.settings.storage.cluster_tmp_dir", "sample_data/storage") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0") .getOrCreate() ``` @@ -932,7 +932,7 @@ spark-shell \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 ``` **pyspark:** @@ -945,7 +945,7 @@ pyspark \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0-rc1 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 ``` **Databricks:** @@ -1217,7 +1217,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.0-rc1.jar") + .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.0.jar") .getOrCreate() ``` @@ -1226,7 +1226,7 @@ spark = SparkSession.builder version (3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x) - If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. 
( - i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.0-rc1.jar`) + i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.0.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/python/docs/conf.py b/python/docs/conf.py index 1a63bec2bfdf1b..faddea1aa5c952 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -23,7 +23,7 @@ author = "John Snow Labs" # The full version, including alpha/beta/rc tags -release = "5.0.0-rc1" +release = "5.0.0" pyspark_version = "3.2.3" # -- General configuration --------------------------------------------------- diff --git a/python/setup.py b/python/setup.py index ebd7988deb5e17..a4c0594f6ce900 100644 --- a/python/setup.py +++ b/python/setup.py @@ -41,7 +41,7 @@ # project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='5.0.0-rc1', # Required + version='5.0.0', # Required # This is a one-line description or tagline of what your project does. This # corresponds to the 'Summary' metadata field: diff --git a/python/sparknlp/__init__.py b/python/sparknlp/__init__.py index 5bc12684dfbcae..5eca3c05193f9f 100644 --- a/python/sparknlp/__init__.py +++ b/python/sparknlp/__init__.py @@ -128,7 +128,7 @@ def start(gpu=False, The initiated Spark session. """ - current_version = "5.0.0-rc1" + current_version = "5.0.0" if params is None: params = {} @@ -298,4 +298,4 @@ def version(): str The current Spark NLP version. 
""" - return '5.0.0-rc1' + return '5.0.0' diff --git a/scripts/colab_setup.sh b/scripts/colab_setup.sh index 2757eb27031e1d..09d14bb884778a 100644 --- a/scripts/colab_setup.sh +++ b/scripts/colab_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash #default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="5.0.0-rc1" +SPARKNLP="5.0.0" PYSPARK="3.2.3" while getopts s:p:g option diff --git a/scripts/kaggle_setup.sh b/scripts/kaggle_setup.sh index 55825765028b25..30c3041c84018b 100644 --- a/scripts/kaggle_setup.sh +++ b/scripts/kaggle_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash #default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="5.0.0-rc1" +SPARKNLP="5.0.0" PYSPARK="3.2.3" while getopts s:p:g option diff --git a/scripts/sagemaker_setup.sh b/scripts/sagemaker_setup.sh index 7f074a47e0b36f..a8839055c6387f 100644 --- a/scripts/sagemaker_setup.sh +++ b/scripts/sagemaker_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash # Default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="5.0.0-rc1" +SPARKNLP="5.0.0" PYSPARK="3.2.3" echo "Setup SageMaker for PySpark $PYSPARK and Spark NLP $SPARKNLP" diff --git a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala index 1eb5ab932ffbf9..12904be9b6bd97 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala @@ -20,7 +20,7 @@ import org.apache.spark.sql.SparkSession object SparkNLP { - val currentVersion = "5.0.0-rc1" + val currentVersion = "5.0.0" val MavenSpark3 = s"com.johnsnowlabs.nlp:spark-nlp_2.12:$currentVersion" val MavenGpuSpark3 = s"com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:$currentVersion" val MavenSparkSilicon = s"com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:$currentVersion" diff --git a/src/main/scala/com/johnsnowlabs/util/Build.scala b/src/main/scala/com/johnsnowlabs/util/Build.scala index 1393865a04f02c..0e1254219d5164 100644 --- a/src/main/scala/com/johnsnowlabs/util/Build.scala +++ 
b/src/main/scala/com/johnsnowlabs/util/Build.scala @@ -17,5 +17,5 @@ package com.johnsnowlabs.util object Build { - val version: String = "5.0.0-rc1" + val version: String = "5.0.0" } From ae688ab8ac511998ccd08ecb72e80f8b28e87abf Mon Sep 17 00:00:00 2001 From: Prabod Rathnayaka Date: Sat, 1 Jul 2023 23:07:33 +1000 Subject: [PATCH 03/13] SPARKNLP 836 - Introducing "Instructor Embeddings" for sentence embeddings like Instructor-XL model (#13849) * Added Instructor Embeddings * Added Instructor Embeddings python code * fixed broadcast bug * fixed broadcast bug * Changed test type to slow --- .../sparknlp/annotator/embeddings/__init__.py | 1 + .../embeddings/instructor_embeddings.py | 204 ++++++++ python/sparknlp/internal/__init__.py | 5 + .../embeddings/instructor_embeddings_test.py | 85 ++++ .../com/johnsnowlabs/ml/ai/Instructor.scala | 211 +++++++++ .../sign/ModelSignatureConstants.scala | 5 + .../nlp/embeddings/InstructorEmbeddings.scala | 440 ++++++++++++++++++ .../InstructorEmbeddingsTestSpec.scala | 65 +++ 8 files changed, 1016 insertions(+) create mode 100755 python/sparknlp/annotator/embeddings/instructor_embeddings.py create mode 100644 python/test/annotator/embeddings/instructor_embeddings_test.py create mode 100644 src/main/scala/com/johnsnowlabs/ml/ai/Instructor.scala create mode 100644 src/main/scala/com/johnsnowlabs/nlp/embeddings/InstructorEmbeddings.scala create mode 100644 src/test/scala/com/johnsnowlabs/nlp/embeddings/InstructorEmbeddingsTestSpec.scala diff --git a/python/sparknlp/annotator/embeddings/__init__.py b/python/sparknlp/annotator/embeddings/__init__.py index 13501c98901163..02ee80f98fa264 100644 --- a/python/sparknlp/annotator/embeddings/__init__.py +++ b/python/sparknlp/annotator/embeddings/__init__.py @@ -22,6 +22,7 @@ from sparknlp.annotator.embeddings.distil_bert_embeddings import * from sparknlp.annotator.embeddings.doc2vec import * from sparknlp.annotator.embeddings.elmo_embeddings import * +from 
sparknlp.annotator.embeddings.instructor_embeddings import * from sparknlp.annotator.embeddings.longformer_embeddings import * from sparknlp.annotator.embeddings.roberta_embeddings import * from sparknlp.annotator.embeddings.roberta_sentence_embeddings import * diff --git a/python/sparknlp/annotator/embeddings/instructor_embeddings.py b/python/sparknlp/annotator/embeddings/instructor_embeddings.py new file mode 100755 index 00000000000000..31ca3c7fd52723 --- /dev/null +++ b/python/sparknlp/annotator/embeddings/instructor_embeddings.py @@ -0,0 +1,204 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains classes for BertEmbeddings.""" + +from sparknlp.common import * + + +class InstructorEmbeddings(AnnotatorModel, + HasEmbeddingsProperties, + HasCaseSensitiveProperties, + HasStorageRef, + HasBatchedAnnotate, + HasMaxSentenceLengthLimit): + """Sentence embeddings using INSTRUCTOR. + + Instructor👨‍🏫, an instruction-finetuned text embedding model that can generate text embeddings tailored to any task (e.g., classification, retrieval, clustering, text evaluation, etc.) and domains (e.g., science, finance, etc.) by simply providing the task instruction, without any finetuning. Instructor👨‍ achieves sota on 70 diverse embedding tasks! + + Pretrained models can be loaded with :meth:`.pretrained` of the companion + object: + + >>> embeddings = InstructorEmbeddings.pretrained() \\ + ... .setInputCols(["document"]) \\ + ... 
.setInstruction("Represent the Medicine sentence for clustering: ") \\ + ... .setOutputCol("instructor_embeddings") + + + The default model is ``"instructor_base"``, if no name is provided. + + For available pretrained models please see the + `Models Hub `__. + + + ====================== ====================== + Input Annotation types Output Annotation type + ====================== ====================== + ``DOCUMENT`` ``SENTENCE_EMBEDDINGS`` + ====================== ====================== + + Parameters + ---------- + batchSize + Size of every batch , by default 8 + dimension + Number of embedding dimensions, by default 768 + caseSensitive + Whether to ignore case in tokens for embeddings matching, by default False + instruction + Set transformer instruction, e.g. 'summarize:' + maxSentenceLength + Max sentence length to process, by default 128 + configProtoBytes + ConfigProto from tensorflow, serialized into byte array. + + References + ---------- + `One Embedder, Any Task: Instruction-Finetuned Text Embeddings `__ + + https://github.com/HKUNLP/instructor-embedding/ + + **Paper abstract** + + *We introduce INSTRUCTOR, a new method for computing text embeddings given task instructions: + every text input is embedded together with instructions explaining the use case (e.g., task and + domain descriptions). Unlike encoders from prior work that are more specialized, INSTRUCTOR is a + single embedder that can generate text embeddings tailored to different downstream tasks and domains, + without any further training. We first annotate instructions for 330 diverse tasks and train INSTRUCTOR + on this multitask mixture with a contrastive loss. We evaluate INSTRUCTOR on 70 embedding evaluation tasks + (66 of which are unseen during training), ranging from classification and information retrieval to semantic + textual similarity and text generation evaluation. 
INSTRUCTOR, while having an order of magnitude fewer + parameters than the previous best model, achieves state-of-the-art performance, with an average improvement + of 3.4% compared to the previous best results on the 70 diverse datasets. Our analysis suggests that + INSTRUCTOR is robust to changes in instructions, and that instruction finetuning mitigates the challenge of + training a single model on diverse datasets. Our model, code, and data are available at this https + URL .* + + Examples + -------- + >>> import sparknlp + >>> from sparknlp.base import * + >>> from sparknlp.annotator import * + >>> from pyspark.ml import Pipeline + >>> documentAssembler = DocumentAssembler() \\ + ... .setInputCol("text") \\ + ... .setOutputCol("document") + >>> embeddings = InstructorEmbeddings.pretrained() \\ + ... .setInputCols(["document"]) \\ + ... .setInstruction("Represent the Medicine sentence for clustering: ") \\ + ... .setOutputCol("instructor_embeddings") + >>> embeddingsFinisher = EmbeddingsFinisher() \\ + ... .setInputCols(["instructor_embeddings"]) \\ + ... .setOutputCols("finished_embeddings") \\ + ... .setOutputAsVector(True) + >>> pipeline = Pipeline().setStages([ + ... documentAssembler, + ... embeddings, + ... embeddingsFinisher + ... 
]) + >>> data = spark.createDataFrame([["Dynamical Scalar Degree of Freedom in Horava-Lifshitz Gravity"]]).toDF("text") + >>> result = pipeline.fit(data).transform(data) + >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80) + +--------------------------------------------------------------------------------+ + | result| + +--------------------------------------------------------------------------------+ + |[-2.3497989177703857,0.480538547039032,-0.3238905668258667,-1.612930893898010...| + +--------------------------------------------------------------------------------+ + """ + + name = "InstructorEmbeddings" + + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + + outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS + instruction = Param(Params._dummy(), "instruction", "Set transformer instruction, e.g. 'summarize:'", + typeConverter=TypeConverters.toString) + configProtoBytes = Param(Params._dummy(), + "configProtoBytes", + "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", + TypeConverters.toListInt) + + def setInstruction(self, value): + """ Sets transformer instruction, e.g. 'summarize:'. + + Parameters + ---------- + value : str + """ + return self._set(instruction=value) + + def setConfigProtoBytes(self, b): + """Sets configProto from tensorflow, serialized into byte array. + + Parameters + ---------- + b : List[int] + ConfigProto from tensorflow, serialized into byte array + """ + return self._set(configProtoBytes=b) + + @keyword_only + def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.InstructorEmbeddings", java_model=None): + super(InstructorEmbeddings, self).__init__( + classname=classname, + java_model=java_model + ) + self._setDefault( + dimension=768, + batchSize=8, + maxSentenceLength=128, + caseSensitive=False, + instruction="", + ) + + @staticmethod + def loadSavedModel(folder, spark_session): + """Loads a locally saved model. 
+ + Parameters + ---------- + folder : str + Folder of the saved model + spark_session : pyspark.sql.SparkSession + The current SparkSession + + Returns + ------- + InstructorEmbeddings + The restored model + """ + from sparknlp.internal import _InstructorLoader + jModel = _InstructorLoader(folder, spark_session._jsparkSession)._java_obj + return InstructorEmbeddings(java_model=jModel) + + @staticmethod + def pretrained(name="instructor_base", lang="en", remote_loc=None): + """Downloads and loads a pretrained model. + + Parameters + ---------- + name : str, optional + Name of the pretrained model, by default "instructor_base" + lang : str, optional + Language of the pretrained model, by default "en" + remote_loc : str, optional + Optional remote address of the resource, by default None. Will use + Spark NLPs repositories otherwise. + + Returns + ------- + InstructorEmbeddings + The restored model + """ + from sparknlp.pretrained import ResourceDownloader + return ResourceDownloader.downloadModel(InstructorEmbeddings, name, lang, remote_loc) diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py index 21b48e7693b793..9131d37788b9e4 100644 --- a/python/sparknlp/internal/__init__.py +++ b/python/sparknlp/internal/__init__.py @@ -529,3 +529,8 @@ def __init__(self, path, jspark): super(_RoBertaForZeroShotClassification, self).__init__( "com.johnsnowlabs.nlp.annotators.classifier.dl.RoBertaForZeroShotClassification.loadSavedModel", path, jspark) + + +class _InstructorLoader(ExtendedJavaWrapper): + def __init__(self, path, jspark): + super(_InstructorLoader, self).__init__("com.johnsnowlabs.nlp.embeddings.InstructorEmbeddings.loadSavedModel", path, jspark) \ No newline at end of file diff --git a/python/test/annotator/embeddings/instructor_embeddings_test.py b/python/test/annotator/embeddings/instructor_embeddings_test.py new file mode 100644 index 00000000000000..4dcfd32f060d45 --- /dev/null +++ 
b/python/test/annotator/embeddings/instructor_embeddings_test.py @@ -0,0 +1,85 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests +from test.util import SparkContextForTest + + +@pytest.mark.slow +class InstructorEmbeddingsTestSpec(unittest.TestCase): + def setUp(self): + self.spark = SparkContextForTest.spark + self.tested_annotator = InstructorEmbeddings.pretrained() \ + .setInstruction("Represent the Wikipedia document for retrieval: ") \ + .setInputCols(["documents"]) \ + .setOutputCol("instructor") + + def runTest(self): + data = self.spark.createDataFrame([ + [1, """Capitalism has been dominant in the Western world since the end of feudalism, but most feel[who?] that + the term "mixed economies" more precisely describes most contemporary economies, due to their containing both + private-owned and state-owned enterprises. In capitalism, prices determine the demand-supply scale. For + example, higher demand for certain goods and services lead to higher prices and lower demand for certain + goods lead to lower prices. 
"""]]).toDF("id", "text") + + document_assembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("documents") + + instruction = self.tested_annotator + + pipeline = Pipeline().setStages([document_assembler, instruction]) + results = pipeline.fit(data).transform(data) + + results.select("instructor.embeddings").show(truncate=False) + +# +# @pytest.mark.slow +# class BertEmbeddingsLoadSavedModelTestSpec(unittest.TestCase): +# +# def setUp(self): +# self.data = SparkContextForTest.spark.read.option("header", "true") \ +# .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") +# +# def runTest(self): +# document_assembler = DocumentAssembler() \ +# .setInputCol("text") \ +# .setOutputCol("document") +# sentence_detector = SentenceDetector() \ +# .setInputCols(["document"]) \ +# .setOutputCol("sentence") +# tokenizer = Tokenizer() \ +# .setInputCols(["sentence"]) \ +# .setOutputCol("token") +# albert = BertEmbeddings.loadSavedModel(os.getcwd() + "/../src/test/resources/tf-hub-bert/model", +# SparkContextForTest.spark) \ +# .setInputCols(["sentence", "token"]) \ +# .setOutputCol("embeddings") +# +# pipeline = Pipeline(stages=[ +# document_assembler, +# sentence_detector, +# tokenizer, +# albert +# ]) +# +# model = pipeline.fit(self.data) +# model.write().overwrite().save("./tmp_bert_pipeline_model") +# model.transform(self.data).show() diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/Instructor.scala b/src/main/scala/com/johnsnowlabs/ml/ai/Instructor.scala new file mode 100644 index 00000000000000..1507da55a59de7 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/ml/ai/Instructor.scala @@ -0,0 +1,211 @@ +/* + * Copyright 2017 - 2023 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.ml.ai + +import com.johnsnowlabs.ml.tensorflow.sentencepiece.SentencePieceWrapper +import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} +import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} +import com.johnsnowlabs.nlp.{Annotation, AnnotatorType} + +import scala.collection.JavaConverters._ + +/** InstructorEmbeddings provides the functionality to generate embeddings for instruction and + * task + * @param tensorflow + * tensorflow wrapper + * @param configProtoBytes + * configProtoBytes + * @param spp + * SentencePieceWrapper + * @param signatures + * signatures + */ + +private[johnsnowlabs] class Instructor( + val tensorflow: TensorflowWrapper, + val spp: SentencePieceWrapper, + configProtoBytes: Option[Array[Byte]] = None, + signatures: Option[Map[String, String]] = None) + extends Serializable { + + private val _tfInstructorSignatures: Map[String, String] = + signatures.getOrElse(ModelSignatureManager.apply()) + private val paddingTokenId = 0 + private val eosTokenId = 1 + + /** Get sentence embeddings for a batch of sentences + * @param batch + * batch of sentences + * @param contextLengths + * context lengths + * @return + * sentence embeddings + */ + private def getSentenceEmbedding( + batch: Seq[Array[Int]], + contextLengths: Seq[Int]): Array[Array[Float]] = { + // get max sentence length + val sequencesLength = batch.map(x => x.length).toArray + val maxSentenceLength = sequencesLength.max + val batchLength = batch.length + + // encode batch + val 
tensorEncoder = new TensorResources() + val inputDim = batch.length * maxSentenceLength + + // create buffers + val encoderInputBuffers = tensorEncoder.createIntBuffer(inputDim) + val encoderAttentionMaskBuffers = tensorEncoder.createIntBuffer(inputDim) + val encoderContextMaskBuffers = tensorEncoder.createIntBuffer(inputDim) + + val shape = Array(batch.length.toLong, maxSentenceLength) + + batch.zipWithIndex.foreach { case (tokenIds, idx) => + val offset = idx * maxSentenceLength + val diff = maxSentenceLength - tokenIds.length + + // pad with 0 + val s = tokenIds.take(maxSentenceLength) ++ Array.fill[Int](diff)(this.paddingTokenId) + encoderInputBuffers.offset(offset).write(s) + + // create attention mask + val mask = s.map(x => if (x != this.paddingTokenId) 1 else 0) + encoderAttentionMaskBuffers.offset(offset).write(mask) + + // create context mask + val contextMask = mask.zipWithIndex.map { + case (x, i) => { if (i < contextLengths(idx)) 0 else x } + } + encoderContextMaskBuffers.offset(offset).write(contextMask) + } + + // create tensors + val encoderInputTensors = tensorEncoder.createIntBufferTensor(shape, encoderInputBuffers) + val encoderAttentionMaskTensors = + tensorEncoder.createIntBufferTensor(shape, encoderAttentionMaskBuffers) + val encoderContextMaskTensors = + tensorEncoder.createIntBufferTensor(shape, encoderContextMaskBuffers) + + // run model + val runner = tensorflow + .getTFSessionWithSignature( + configProtoBytes = configProtoBytes, + initAllTables = false, + savedSignatures = signatures) + .runner + + runner + .feed( + _tfInstructorSignatures.getOrElse( + ModelSignatureConstants.EncoderInputIds.key, + "missing_encoder_input_ids"), + encoderInputTensors) + .feed( + _tfInstructorSignatures.getOrElse( + ModelSignatureConstants.EncoderAttentionMask.key, + "missing_encoder_attention_mask"), + encoderAttentionMaskTensors) + .feed( + _tfInstructorSignatures.getOrElse( + ModelSignatureConstants.EncoderContextMask.key, + 
"missing_encoder_context_mask"), + encoderContextMaskTensors) + .fetch(_tfInstructorSignatures + .getOrElse(ModelSignatureConstants.LastHiddenState.key, "missing_last_hidden_state")) + + // get embeddings + val sentenceEmbeddings = runner.run().asScala + val sentenceEmbeddingsFloats = TensorResources.extractFloats(sentenceEmbeddings.head) + val dim = sentenceEmbeddingsFloats.length / batchLength + + // group embeddings + val sentenceEmbeddingsFloatsArray = sentenceEmbeddingsFloats.grouped(dim).toArray + + // close buffers + sentenceEmbeddings.foreach(_.close()) + encoderInputTensors.close() + encoderAttentionMaskTensors.close() + encoderContextMaskTensors.close() + tensorEncoder.clearTensors() + tensorEncoder.clearSession(sentenceEmbeddings) + + sentenceEmbeddingsFloatsArray + } + + /** Tokenize sentences + * @param sentences + * sentences + * @param task + * task + * @param maxSentenceLength + * max sentence length + * @return + */ + def tokenize( + sentences: Seq[Annotation], + task: String, + maxSentenceLength: Int): Seq[Array[Int]] = { + sentences.map(s => { + val sentWithTask = if (task.nonEmpty) task.concat("").concat(s.result) else s.result + spp.getSppModel.encodeAsIds(sentWithTask).take(maxSentenceLength - 1) ++ Array( + this.eosTokenId) + }) + } + + /** Predict sentence embeddings + * @param sentences + * sentences + * @param batchSize + * batch size + * @param maxSentenceLength + * max sentence length + * @param instruction + * instruction + * @return + */ + def predict( + sentences: Seq[Annotation], + batchSize: Int, + maxSentenceLength: Int, + instruction: String): Seq[Annotation] = { + + val instructionTokenized = spp.getSppModel.encodeAsIds(instruction) + // repeat instruction length for each sentence + val instructionTokenizedRepeated: Array[Int] = + Array.fill(sentences.length)(instructionTokenized.length) + + val batchEmbeddings = sentences.grouped(batchSize).toArray.flatMap { batch => + // encode batch + val batchSP = tokenize(batch, instruction, 
maxSentenceLength) + // get sentence embeddings + val sentenceEmbeddings = getSentenceEmbedding(batchSP, instructionTokenizedRepeated) + + // create annotations + batch.zip(sentenceEmbeddings).map { case (sentence, vectors) => + Annotation( + annotatorType = AnnotatorType.SENTENCE_EMBEDDINGS, + begin = sentence.begin, + end = sentence.end, + result = sentence.result, + metadata = sentence.metadata, + embeddings = vectors) + } + } + batchEmbeddings + } + +} diff --git a/src/main/scala/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants.scala b/src/main/scala/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants.scala index 383c6b8751582a..94516652e6f30b 100644 --- a/src/main/scala/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants.scala +++ b/src/main/scala/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants.scala @@ -273,6 +273,11 @@ object ModelSignatureConstants { override val value: String = "StatefulPartitionedCall_1:2" } + case object EncoderContextMask extends TFInfoNameMapper { + override val key: String = "encoder_context_mask" + override val value: String = "encoder_encoder_context_mask:0" + } + /** Retrieve signature patterns for a given provider * * @param modelProvider diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/InstructorEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/InstructorEmbeddings.scala new file mode 100644 index 00000000000000..8caee7749a00ad --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/InstructorEmbeddings.scala @@ -0,0 +1,440 @@ +/* + * Copyright 2017-2022 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.nlp.embeddings + +import com.johnsnowlabs.ml.ai.Instructor +import com.johnsnowlabs.ml.tensorflow._ +import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ + ReadSentencePieceModel, + SentencePieceWrapper, + WriteSentencePieceModel +} +import com.johnsnowlabs.ml.util.LoadExternalModel.{ + loadSentencePieceAsset, + modelSanityCheck, + notSupportedEngineError +} +import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.nlp._ +import com.johnsnowlabs.nlp.serialization.MapFeature +import com.johnsnowlabs.storage.HasStorageRef +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.slf4j.{Logger, LoggerFactory} + +/** Sentence embeddings using INSTRUCTOR. + * + * Instructor👨‍🏫, an instruction-finetuned text embedding model that can generate text + * embeddings tailored to any task (e.g., classification, retrieval, clustering, text evaluation, + * etc.) and domains (e.g., science, finance, etc.) by simply providing the task instruction, + * without any finetuning. Instructor👨‍ achieves sota on 70 diverse embedding tasks! + * + * Pretrained models can be loaded with `pretrained` of the companion object: + * {{{ + * val embeddings = InstructorEmbeddings.pretrained() + * .setInputCols("document") + * .setOutputCol("instructor_embeddings") + * }}} + * The default model is `"instructor_base"`, if no name is provided. 
+ * + * For available pretrained models please see the + * [[https://sparknlp.org/models?q=Instructor Models Hub]]. + * + * For extended examples of usage, see + * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/embeddings/InstructorEmbeddingsTestSpec.scala InstructorEmbeddingsTestSpec]]. + * + * '''Sources''' : + * + * [[https://arxiv.org/abs/2212.09741 One Embedder, Any Task: Instruction-Finetuned Text Embeddings]] + * + * [[https://github.com/HKUNLP/instructor-embedding/ INSTRUCTOR Github Repository]] + * + * ''' Paper abstract ''' + * + * ''We introduce INSTRUCTOR, a new method for computing text embeddings given task instructions: + * every text input is embedded together with instructions explaining the use case (e.g., task + * and domain descriptions). Unlike encoders from prior work that are more specialized, + * INSTRUCTOR is a single embedder that can generate text embeddings tailored to different + * downstream tasks and domains, without any further training. We first annotate instructions for + * 330 diverse tasks and train INSTRUCTOR on this multitask mixture with a contrastive loss. We + * evaluate INSTRUCTOR on 70 embedding evaluation tasks (66 of which are unseen during training), + * ranging from classification and information retrieval to semantic textual similarity and text + * generation evaluation. INSTRUCTOR, while having an order of magnitude fewer parameters than + * the previous best model, achieves state-of-the-art performance, with an average improvement of + * 3.4% compared to the previous best results on the 70 diverse datasets. Our analysis suggests + * that INSTRUCTOR is robust to changes in instructions, and that instruction finetuning + * mitigates the challenge of training a single model on diverse datasets. Our model, code, and + * data are available at this https URL. 
[[https://instructor-embedding.github.io/]] '' + * + * ==Example== + * {{{ + * import spark.implicits._ + * import com.johnsnowlabs.nlp.base.DocumentAssembler + * import com.johnsnowlabs.nlp.annotators.Tokenizer + * import com.johnsnowlabs.nlp.embeddings.InstructorEmbeddings + * import com.johnsnowlabs.nlp.EmbeddingsFinisher + * import org.apache.spark.ml.Pipeline + * + * val documentAssembler = new DocumentAssembler() + * .setInputCol("text") + * .setOutputCol("document") + * + * val embeddings = InstructorEmbeddings.pretrained("instructor_base", "en") + * .setInputCols("document") + * .setInstruction("Represent the Medicine sentence for clustering: ") + * .setOutputCol("instructor_embeddings") + * + * val embeddingsFinisher = new EmbeddingsFinisher() + * .setInputCols("instructor_embeddings") + * .setOutputCols("finished_embeddings") + * .setOutputAsVector(true) + * + * val pipeline = new Pipeline().setStages(Array( + * documentAssembler, + * embeddings, + * embeddingsFinisher + * )) + * + * val data = Seq("Dynamical Scalar Degree of Freedom in Horava-Lifshitz Gravity").toDF("text") + * val result = pipeline.fit(data).transform(data) + * + * result.selectExpr("explode(finished_embeddings) as result").show(1, 80) + * +--------------------------------------------------------------------------------+ + * | result| + * +--------------------------------------------------------------------------------+ + * |[-2.3497989177703857,0.480538547039032,-0.3238905668258667,-1.612930893898010...| + * +--------------------------------------------------------------------------------+ + * }}} + * + * @see + * [[https://sparknlp.org/docs/en/annotators Annotators Main Page]] for a list of transformer + * based embeddings + * @param uid + * required uid for storing annotator to disk + * @groupname anno Annotator types + * @groupdesc anno + * Required input and expected output annotator types + * @groupname Ungrouped Members + * @groupname param Parameters + * @groupname setParam 
Parameter setters + * @groupname getParam Parameter getters + * @groupname Ungrouped Members + * @groupprio param 1 + * @groupprio anno 2 + * @groupprio Ungrouped 3 + * @groupprio setParam 4 + * @groupprio getParam 5 + * @groupdesc param + * A list of (hyper-)parameter keys this annotator can take. Users can set and get the + * parameter values through setters and getters, respectively. + */ +class InstructorEmbeddings(override val uid: String) + extends AnnotatorModel[InstructorEmbeddings] + with HasBatchedAnnotate[InstructorEmbeddings] + with WriteTensorflowModel + with HasEmbeddingsProperties + with HasStorageRef + with WriteSentencePieceModel + with HasCaseSensitiveProperties + with HasEngine { + + /** Annotator reference id. Used to identify elements in metadata or to refer to this annotator + * type + */ + override val inputAnnotatorTypes: Array[String] = + Array(AnnotatorType.DOCUMENT) + override val outputAnnotatorType: AnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS + + /** ConfigProto from tensorflow, serialized into byte array. Get with + * `config_proto.SerializeToString()` + * + * @group param + */ + val configProtoBytes = new IntArrayParam( + this, + "configProtoBytes", + "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()") + + /** Max sentence length to process (Default: `128`) + * + * @group param + */ + val maxSentenceLength = + new IntParam(this, "maxSentenceLength", "Max sentence length to process") + + /** Set transformer instruction, e.g. 'summarize' format: `"instruction:"`. + * + * @group param + */ + val instruction = + new Param[String](this, "instruction", "Set transformer instruction, e.g. 
'summarize'") + + /** It contains TF model signatures for the laded saved model + * + * @group param + */ + val signatures = + new MapFeature[String, String](model = this, name = "signatures").setProtected() + private var _model: Option[Broadcast[Instructor]] = None + + def this() = this(Identifiable.randomUID("INSTRUCTOR_EMBEDDINGS")) + + /** @group setParam */ + def setConfigProtoBytes(bytes: Array[Int]): InstructorEmbeddings.this.type = + set(this.configProtoBytes, bytes) + + /** @group setParam */ + def setMaxSentenceLength(value: Int): this.type = { + require( + value <= 512, + "Instructor models do not support sequences longer than 512 because of trainable positional embeddings.") + require(value >= 1, "The maxSentenceLength must be at least 1") + set(maxSentenceLength, value) + this + } + + /** @group getParam */ + def getMaxSentenceLength: Int = $(maxSentenceLength) + + def setInstruction(value: String): InstructorEmbeddings.this.type = { + set(instruction, value) + this + } + + /** @group setParam */ + def setSignatures(value: Map[String, String]): this.type = { + if (get(signatures).isEmpty) + set(signatures, value) + this + } + + /** @group setParam */ + def setModelIfNotSet( + spark: SparkSession, + tensorflowWrapper: TensorflowWrapper, + spp: SentencePieceWrapper): InstructorEmbeddings = { + if (_model.isEmpty) { + _model = Some( + spark.sparkContext.broadcast( + new Instructor( + tensorflowWrapper, + spp = spp, + configProtoBytes = getConfigProtoBytes, + signatures = getSignatures))) + } + + this + } + + /** Set Embeddings dimensions for the BERT model Only possible to set this when the first time + * is saved dimension is not changeable, it comes from BERT config file + * + * @group setParam + */ + override def setDimension(value: Int): this.type = { + if (get(dimension).isEmpty) + set(this.dimension, value) + this + } + + /** Whether to lowercase tokens or not + * + * @group setParam + */ + override def setCaseSensitive(value: Boolean): this.type = 
{ + if (get(caseSensitive).isEmpty) + set(this.caseSensitive, value) + this + } + + setDefault( + dimension -> 768, + batchSize -> 8, + maxSentenceLength -> 128, + caseSensitive -> false, + instruction -> "") + + /** takes a document and annotations and produces new annotations of this annotator's annotation + * type + * + * @param batchedAnnotations + * Annotations that correspond to inputAnnotationCols generated by previous annotators if any + * @return + * any number of annotations processed for every input annotation. Not necessary one to one + * relationship + */ + override def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] = { + + val allAnnotations = batchedAnnotations + .filter(_.nonEmpty) + .zipWithIndex + .flatMap { case (annotations, i) => + annotations.filter(_.result.nonEmpty).map(x => (x, i)) + } + val processedAnnotations = if (allAnnotations.nonEmpty) { + this.getModelIfNotSet.predict( + sentences = allAnnotations.map(_._1), + batchSize = $(batchSize), + maxSentenceLength = $(maxSentenceLength), + instruction = $(instruction)) + } else { + Seq() + } + + // Group resulting annotations by rows. 
If there are not sentences in a given row, return empty sequence + batchedAnnotations.indices.map(rowIndex => { + val rowAnnotations = processedAnnotations + // zip each annotation with its corresponding row index + .zip(allAnnotations) + // select the sentences belonging to the current row + .filter(_._2._2 == rowIndex) + // leave the annotation only + .map(_._1) + + if (rowAnnotations.nonEmpty) + rowAnnotations + else + Seq.empty[Annotation] + }) + + } + + /** @group getParam */ + def getModelIfNotSet: Instructor = _model.get.value + + override def onWrite(path: String, spark: SparkSession): Unit = { + super.onWrite(path, spark) + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflow, + "_instructor", + InstructorEmbeddings.tfFile, + configProtoBytes = getConfigProtoBytes, + savedSignatures = getSignatures) + writeSentencePieceModel( + path, + spark, + getModelIfNotSet.spp, + "_instructor", + InstructorEmbeddings.sppFile) + + } + + /** @group getParam */ + def getConfigProtoBytes: Option[Array[Byte]] = get(this.configProtoBytes).map(_.map(_.toByte)) + + /** @group getParam */ + def getSignatures: Option[Map[String, String]] = get(this.signatures) + + override protected def afterAnnotate(dataset: DataFrame): DataFrame = { + dataset.withColumn( + getOutputCol, + wrapSentenceEmbeddingsMetadata( + dataset.col(getOutputCol), + $(dimension), + Some($(storageRef)))) + } + +} + +trait ReadablePretrainedInstructorModel + extends ParamsAndFeaturesReadable[InstructorEmbeddings] + with HasPretrained[InstructorEmbeddings] { + override val defaultModelName: Some[String] = Some("instructor_base") + + /** Java compliant-overrides */ + override def pretrained(): InstructorEmbeddings = super.pretrained() + + override def pretrained(name: String): InstructorEmbeddings = super.pretrained(name) + + override def pretrained(name: String, lang: String): InstructorEmbeddings = + super.pretrained(name, lang) + + override def pretrained(name: String, lang: String, 
remoteLoc: String): InstructorEmbeddings = + super.pretrained(name, lang, remoteLoc) +} + +trait ReadInstructorDLModel extends ReadTensorflowModel with ReadSentencePieceModel { + this: ParamsAndFeaturesReadable[InstructorEmbeddings] => + + override val tfFile: String = "instructor_tensorflow" + override val sppFile: String = "instructor_spp" + def readModel(instance: InstructorEmbeddings, path: String, spark: SparkSession): Unit = { + + val tf = readTensorflowModel( + path, + spark, + "_instructor_tf", + savedSignatures = instance.getSignatures, + initAllTables = false) + val spp = readSentencePieceModel(path, spark, "_instructor_spp", sppFile) + instance.setModelIfNotSet(spark, tf, spp) + } + + addReader(readModel) + + def loadSavedModel(modelPath: String, spark: SparkSession): InstructorEmbeddings = { + + val (localModelPath, detectedEngine) = modelSanityCheck(modelPath) + + /*Universal parameters for all engines*/ + val annotatorModel = new InstructorEmbeddings() + + annotatorModel.set(annotatorModel.engine, detectedEngine) + val spModel = loadSentencePieceAsset(localModelPath, "spiece.model") + detectedEngine match { + case ModelEngine.tensorflow => + val (wrapper, signatures) = TensorflowWrapper.read( + localModelPath, + zipped = false, + useBundle = true, + tags = Array("serve"), + initAllTables = false) + + val _signatures = signatures match { + case Some(s) => s + case None => throw new Exception("Cannot load signature definitions from model!") + } + + /** the order of setSignatures is important if we use getSignatures inside + * setModelIfNotSet + */ + annotatorModel + .setSignatures(_signatures) + .setModelIfNotSet(spark, wrapper, spModel) + + case _ => + throw new Exception(notSupportedEngineError) + } + + annotatorModel + } +} + +/** This is the companion object of [[InstructorEmbeddings]]. Please refer to that class for the + * documentation. 
+ */ +object InstructorEmbeddings + extends ReadablePretrainedInstructorModel + with ReadInstructorDLModel + with ReadSentencePieceModel { + private[InstructorEmbeddings] val logger: Logger = + LoggerFactory.getLogger("InstructorEmbeddings") +} diff --git a/src/test/scala/com/johnsnowlabs/nlp/embeddings/InstructorEmbeddingsTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/embeddings/InstructorEmbeddingsTestSpec.scala new file mode 100644 index 00000000000000..717dc494e0c120 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/embeddings/InstructorEmbeddingsTestSpec.scala @@ -0,0 +1,65 @@ +/* + * Copyright 2017-2022 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.nlp.embeddings + +import com.johnsnowlabs.nlp.base.DocumentAssembler +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.tags.SlowTest +import org.apache.spark.ml.Pipeline +import org.scalatest.flatspec.AnyFlatSpec + +class InstructorEmbeddingsTestSpec extends AnyFlatSpec { + + "Instructor Embeddings" should "correctly embed multiple sentences" taggedAs SlowTest in { + + import ResourceHelper.spark.implicits._ + + val ddd = Seq( + "Capitalism has been dominant in the Western world since the end of feudalism, but most feel[who?]" + + " that the term \"mixed economies\" more precisely describes most contemporary economies, due to their " + + "containing both private-owned and state-owned enterprises. 
In capitalism, prices determine the " + + "demand-supply scale. For example, higher demand for certain goods and services lead to higher prices " + + "and lower demand for certain goods lead to lower prices.", + "The disparate impact theory is especially controversial under the Fair Housing Act because the Act " + + "regulates many activities relating to housing, insurance, and mortgage loans—and some scholars" + + " have argued that the theory's use under the Fair Housing Act, combined with extensions of the " + + "Community Reinvestment Act, contributed to rise of sub-prime lending and the crash of the U.S. " + + "housing market and ensuing global economic recession", + "Disparate impact in United States labor law refers to practices in employment, housing, and other" + + " areas that adversely affect one group of people of a protected characteristic more than another, " + + "even though rules applied by employers or landlords are formally neutral. Although the protected classes " + + "vary by statute, most federal civil rights laws protect based on race, color, religion, national origin, " + + "and sex as protected traits, and some laws include disability status and other traits as well.") + .toDF("text") + + val document = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + + val embeddings = InstructorEmbeddings + .pretrained() + .setInstruction("Represent the Wikipedia document for retrieval: ") + .setInputCols(Array("document")) + .setOutputCol("instructor") + + val pipeline = new Pipeline().setStages(Array(document, embeddings)) + + val pipelineDF = pipeline.fit(ddd).transform(ddd) + pipelineDF.select("instructor.embeddings").show(truncate = false) + + } +} From c2dd80db4b88aa5a582dd36fd7297ae94431ad94 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Sat, 1 Jul 2023 15:09:47 +0200 Subject: [PATCH 04/13] =?UTF-8?q?Integrating=20ONNX=20runtime=20(ORT)=20in?= =?UTF-8?q?=20Spark=20NLP=205.0.0=20=F0=9F=8E=89=20(#13857)?= MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add ONNX Runtime to the dependencies * Add both CPU and GPU coordinates for onnxruntime * Implement OnnxSerializeModel * Implement OnnxWrapper * Update error message for loading external models * Add support for ONNX to BertEmbeddings annotator * Add support for ONNX to BERT backend * Add support for ONNX to DeBERTa * Implement ONNX in DeBERTa backend * Adapt Bert For sentence embeddings with the new backend * Update unit test for BERT (temp) * Update unit test for DeBERTa (temp) * Update onnxruntime and google cloud dependencies * Seems Apple Silicon and Aarch64 are supported in onnxruntime * Cleaning up * Remove bad merge * Update BERT unit test * Add fix me to the try * Making withSafeOnnxModelLoader thread safe * update onnxruntime * Revert back to normal unit tests for now [ski ptest] * Added ADT for ModelEngine (#13862) Co-authored-by: Stefano Lori * Optimize ONNX on CPU * refactor * Add ONNX support to DistilBERT * Add support for ONNX in RoBERTa * Fix the bad serialization on write * Fix using the wrong object --------- Co-authored-by: Stefano Lori Co-authored-by: Stefano Lori --- build.sbt | 11 + project/Dependencies.scala | 5 +- .../scala/com/johnsnowlabs/ml/ai/Bert.scala | 280 ++++++++++++------ .../com/johnsnowlabs/ml/ai/DeBerta.scala | 143 ++++++--- .../com/johnsnowlabs/ml/ai/DistilBert.scala | 123 +++++--- .../com/johnsnowlabs/ml/ai/RoBerta.scala | 124 +++++--- .../ml/onnx/OnnxSerializeModel.scala | 98 ++++++ .../johnsnowlabs/ml/onnx/OnnxWrapper.scala | 162 ++++++++++ .../ml/util/LoadExternalModel.scala | 28 +- .../johnsnowlabs/ml/util/ModelEngine.scala | 31 +- .../com/johnsnowlabs/nlp/HasEngine.scala | 4 +- .../nlp/annotators/audio/HubertForCTC.scala | 4 +- .../nlp/annotators/audio/Wav2Vec2ForCTC.scala | 4 +- .../dl/AlbertForQuestionAnswering.scala | 4 +- .../dl/AlbertForSequenceClassification.scala | 4 +- .../dl/AlbertForTokenClassification.scala | 4 +- 
.../dl/BertForQuestionAnswering.scala | 4 +- .../dl/BertForSequenceClassification.scala | 4 +- .../dl/BertForTokenClassification.scala | 4 +- .../dl/BertForZeroShotClassification.scala | 8 +- .../dl/CamemBertForQuestionAnswering.scala | 4 +- .../CamemBertForSequenceClassification.scala | 4 +- .../dl/CamemBertForTokenClassification.scala | 4 +- .../dl/DeBertaForQuestionAnswering.scala | 4 +- .../dl/DeBertaForSequenceClassification.scala | 4 +- .../dl/DeBertaForTokenClassification.scala | 4 +- .../dl/DistilBertForQuestionAnswering.scala | 4 +- .../DistilBertForSequenceClassification.scala | 4 +- .../dl/DistilBertForTokenClassification.scala | 4 +- .../DistilBertForZeroShotClassification.scala | 4 +- .../dl/LongformerForQuestionAnswering.scala | 4 +- .../LongformerForSequenceClassification.scala | 4 +- .../dl/LongformerForTokenClassification.scala | 4 +- .../dl/RoBertaForQuestionAnswering.scala | 4 +- .../dl/RoBertaForSequenceClassification.scala | 4 +- .../dl/RoBertaForTokenClassification.scala | 4 +- .../dl/RoBertaForZeroShotClassification.scala | 4 +- .../dl/TapasForQuestionAnswering.scala | 4 +- .../dl/XlmRoBertaForQuestionAnswering.scala | 4 +- .../XlmRoBertaForSequenceClassification.scala | 4 +- .../dl/XlmRoBertaForTokenClassification.scala | 4 +- .../dl/XlnetForSequenceClassification.scala | 4 +- .../dl/XlnetForTokenClassification.scala | 4 +- .../annotators/coref/SpanBertCorefModel.scala | 4 +- .../cv/ConvNextForImageClassification.scala | 4 +- .../cv/SwinForImageClassification.scala | 4 +- .../cv/ViTForImageClassification.scala | 6 +- .../annotators/ld/dl/LanguageDetectorDL.scala | 4 +- .../annotators/seq2seq/BartTransformer.scala | 4 +- .../annotators/seq2seq/GPT2Transformer.scala | 4 +- .../seq2seq/MarianTransformer.scala | 4 +- .../annotators/seq2seq/T5Transformer.scala | 4 +- .../nlp/embeddings/AlbertEmbeddings.scala | 4 +- .../nlp/embeddings/BertEmbeddings.scala | 91 ++++-- .../embeddings/BertSentenceEmbeddings.scala | 79 +++-- 
.../nlp/embeddings/CamemBertEmbeddings.scala | 4 +- .../nlp/embeddings/DeBertaEmbeddings.scala | 88 ++++-- .../nlp/embeddings/DistilBertEmbeddings.scala | 93 ++++-- .../nlp/embeddings/ElmoEmbeddings.scala | 4 +- .../nlp/embeddings/LongformerEmbeddings.scala | 15 +- .../nlp/embeddings/RoBertaEmbeddings.scala | 65 +++- .../RoBertaSentenceEmbeddings.scala | 16 +- .../embeddings/UniversalSentenceEncoder.scala | 4 +- .../nlp/embeddings/XlmRoBertaEmbeddings.scala | 4 +- .../XlmRoBertaSentenceEmbeddings.scala | 4 +- .../nlp/embeddings/XlnetEmbeddings.scala | 4 +- 66 files changed, 1199 insertions(+), 455 deletions(-) create mode 100644 src/main/scala/com/johnsnowlabs/ml/onnx/OnnxSerializeModel.scala create mode 100644 src/main/scala/com/johnsnowlabs/ml/onnx/OnnxWrapper.scala diff --git a/build.sbt b/build.sbt index 7821eecfcc413b..c9e37ecd4a699e 100644 --- a/build.sbt +++ b/build.sbt @@ -165,6 +165,16 @@ val tensorflowDependencies: Seq[sbt.ModuleID] = else Seq(tensorflowCPU) +val onnxDependencies: Seq[sbt.ModuleID] = + if (is_gpu.equals("true")) + Seq(onnxGPU) + else if (is_silicon.equals("true")) + Seq(onnxCPU) + else if (is_aarch64.equals("true")) + Seq(onnxCPU) + else + Seq(onnxCPU) + lazy val mavenProps = settingKey[Unit]("workaround for Maven properties") lazy val root = (project in file(".")) @@ -175,6 +185,7 @@ lazy val root = (project in file(".")) testDependencies ++ utilDependencies ++ tensorflowDependencies ++ + onnxDependencies ++ typedDependencyParserDependencies, // TODO potentially improve this? 
mavenProps := { diff --git a/project/Dependencies.scala b/project/Dependencies.scala index f36d7f528d3c54..99b725d3b51c76 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -108,7 +108,10 @@ object Dependencies { val tensorflowM1 = "com.johnsnowlabs.nlp" %% "tensorflow-m1" % tensorflowVersion val tensorflowLinuxAarch64 = "com.johnsnowlabs.nlp" %% "tensorflow-aarch64" % tensorflowVersion - val gcpStorageVersion = "2.16.0" + val onnxRuntimeVersion = "1.15.0" + val onnxCPU = "com.microsoft.onnxruntime" % "onnxruntime" % onnxRuntimeVersion + val onnxGPU = "com.microsoft.onnxruntime" % "onnxruntime_gpu" % onnxRuntimeVersion + val gcpStorageVersion = "2.20.1" val gcpStorage = "com.google.cloud" % "google-cloud-storage" % gcpStorageVersion /** ------- Dependencies end ------- */ diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/Bert.scala b/src/main/scala/com/johnsnowlabs/ml/ai/Bert.scala index ed704f8c3ef476..c291b9f23c549d 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/Bert.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/Bert.scala @@ -16,10 +16,12 @@ package com.johnsnowlabs.ml.ai +import ai.onnxruntime.OnnxTensor import com.johnsnowlabs.ml.ai.util.PrepareEmbeddings +import com.johnsnowlabs.ml.onnx.OnnxWrapper import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} -import com.johnsnowlabs.ml.util.ModelArch +import com.johnsnowlabs.ml.util.{ModelArch, ONNX, TensorFlow} import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.{Annotation, AnnotatorType} @@ -35,6 +37,8 @@ import scala.collection.JavaConverters._ * * @param tensorflowWrapper * Bert Model wrapper with TensorFlow Wrapper + * @param onnxWrapper + * Bert Model wrapper with ONNX Wrapper * @param sentenceStartTokenId * Id of sentence start Token * @param sentenceEndTokenId @@ -47,7 +51,8 @@ import scala.collection.JavaConverters._ * Source: 
[[https://github.com/google-research/bert]] */ private[johnsnowlabs] class Bert( - val tensorflowWrapper: TensorflowWrapper, + val tensorflowWrapper: Option[TensorflowWrapper], + val onnxWrapper: Option[OnnxWrapper], sentenceStartTokenId: Int, sentenceEndTokenId: Int, configProtoBytes: Option[Array[Byte]] = None, @@ -57,6 +62,10 @@ private[johnsnowlabs] class Bert( extends Serializable { val _tfBertSignatures: Map[String, String] = signatures.getOrElse(ModelSignatureManager.apply()) + val detectedEngine: String = + if (tensorflowWrapper.isDefined) TensorFlow.name + else if (onnxWrapper.isDefined) ONNX.name + else TensorFlow.name private def sessionWarmup(): Unit = { val dummyInput = @@ -74,51 +83,99 @@ private[johnsnowlabs] class Bert( sessionWarmup() def tag(batch: Seq[Array[Int]]): Seq[Array[Array[Float]]] = { - val maxSentenceLength = batch.map(pieceIds => pieceIds.length).max val batchLength = batch.length - val tensors = new TensorResources() - - val (tokenTensors, maskTensors, segmentTensors) = - PrepareEmbeddings.prepareBatchTensorsWithSegment( - tensors = tensors, - batch = batch, - maxSentenceLength = maxSentenceLength, - batchLength = batchLength) - - val runner = tensorflowWrapper - .getTFSessionWithSignature( - configProtoBytes = configProtoBytes, - savedSignatures = signatures, - initAllTables = false) - .runner - - runner - .feed( - _tfBertSignatures.getOrElse( - ModelSignatureConstants.InputIdsV1.key, - "missing_input_id_key"), - tokenTensors) - .feed( - _tfBertSignatures - .getOrElse(ModelSignatureConstants.AttentionMaskV1.key, "missing_input_mask_key"), - maskTensors) - .feed( - _tfBertSignatures - .getOrElse(ModelSignatureConstants.TokenTypeIdsV1.key, "missing_segment_ids_key"), - segmentTensors) - .fetch(_tfBertSignatures - .getOrElse(ModelSignatureConstants.LastHiddenStateV1.key, "missing_sequence_output_key")) - - val outs = runner.run().asScala - val embeddings = TensorResources.extractFloats(outs.head) + val embeddings = detectedEngine match 
{ + + case ONNX.name => + // [nb of encoded sentences , maxSentenceLength] + val (runner, env) = onnxWrapper.get.getSession() + + val tokenTensors = + OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + val maskTensors = + OnnxTensor.createTensor( + env, + batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + + val segmentTensors = + OnnxTensor.createTensor(env, batch.map(x => Array.fill(maxSentenceLength)(0L)).toArray) + + val inputs = + Map( + "input_ids" -> tokenTensors, + "attention_mask" -> maskTensors, + "token_type_ids" -> segmentTensors).asJava + + // TODO: A try without a catch or finally is equivalent to putting its body in a block; no exceptions are handled. + try { + val results = runner.run(inputs) + try { + val embeddings = results + .get("last_hidden_state") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + tokenTensors.close() + maskTensors.close() + segmentTensors.close() + // runner.close() + // env.close() + // + embeddings + } finally if (results != null) results.close() + } + case _ => + val tensors = new TensorResources() + + val (tokenTensors, maskTensors, segmentTensors) = + PrepareEmbeddings.prepareBatchTensorsWithSegment( + tensors, + batch, + maxSentenceLength, + batchLength) + + val runner = tensorflowWrapper.get + .getTFSessionWithSignature( + configProtoBytes = configProtoBytes, + savedSignatures = signatures, + initAllTables = false) + .runner + + runner + .feed( + _tfBertSignatures.getOrElse( + ModelSignatureConstants.InputIdsV1.key, + "missing_input_id_key"), + tokenTensors) + .feed( + _tfBertSignatures + .getOrElse(ModelSignatureConstants.AttentionMaskV1.key, "missing_input_mask_key"), + maskTensors) + .feed( + _tfBertSignatures + .getOrElse(ModelSignatureConstants.TokenTypeIdsV1.key, "missing_segment_ids_key"), + segmentTensors) + .fetch( + _tfBertSignatures + .getOrElse( + ModelSignatureConstants.LastHiddenStateV1.key, + "missing_sequence_output_key")) + + val outs 
= runner.run().asScala + val embeddings = TensorResources.extractFloats(outs.head) + + tokenTensors.close() + maskTensors.close() + segmentTensors.close() + tensors.clearSession(outs) + tensors.clearTensors() + + embeddings - tokenTensors.close() - maskTensors.close() - segmentTensors.close() - tensors.clearSession(outs) - tensors.clearTensors() + } PrepareEmbeddings.prepareBatchWordEmbeddings( batch, @@ -133,48 +190,91 @@ private[johnsnowlabs] class Bert( val maxSentenceLength = batch.map(pieceIds => pieceIds.length).max val batchLength = batch.length - val tensors = new TensorResources() - - val (tokenTensors, maskTensors, segmentTensors) = - PrepareEmbeddings.prepareBatchTensorsWithSegment( - tensors = tensors, - batch = batch, - maxSentenceLength = maxSentenceLength, - batchLength = batchLength) - - val runner = tensorflowWrapper - .getTFSessionWithSignature( - configProtoBytes = configProtoBytes, - savedSignatures = signatures, - initAllTables = false) - .runner - - runner - .feed( - _tfBertSignatures.getOrElse( - ModelSignatureConstants.InputIdsV1.key, - "missing_input_id_key"), - tokenTensors) - .feed( - _tfBertSignatures - .getOrElse(ModelSignatureConstants.AttentionMaskV1.key, "missing_input_mask_key"), - maskTensors) - .feed( - _tfBertSignatures - .getOrElse(ModelSignatureConstants.TokenTypeIdsV1.key, "missing_segment_ids_key"), - segmentTensors) - .fetch(_tfBertSignatures - .getOrElse(ModelSignatureConstants.PoolerOutput.key, "missing_pooled_output_key")) - - val outs = runner.run().asScala - val embeddings = TensorResources.extractFloats(outs.head) - - tokenTensors.close() - maskTensors.close() - segmentTensors.close() - tensors.clearSession(outs) - tensors.clearTensors() + val embeddings = detectedEngine match { + case ONNX.name => + // [nb of encoded sentences , maxSentenceLength] + val (runner, env) = onnxWrapper.get.getSession() + + val tokenTensors = + OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + val maskTensors = + 
OnnxTensor.createTensor( + env, + batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + + val segmentTensors = + OnnxTensor.createTensor(env, batch.map(x => Array.fill(maxSentenceLength)(0L)).toArray) + + val inputs = + Map( + "input_ids" -> tokenTensors, + "attention_mask" -> maskTensors, + "token_type_ids" -> segmentTensors).asJava + + try { + val results = runner.run(inputs) + try { + val embeddings = results + .get("pooler_output") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + tokenTensors.close() + maskTensors.close() + segmentTensors.close() + // runner.close() + // env.close() + // + embeddings + } finally if (results != null) results.close() + } + case _ => + val tensors = new TensorResources() + + val (tokenTensors, maskTensors, segmentTensors) = + PrepareEmbeddings.prepareBatchTensorsWithSegment( + tensors, + batch, + maxSentenceLength, + batchLength) + + val runner = tensorflowWrapper.get + .getTFSessionWithSignature( + configProtoBytes = configProtoBytes, + savedSignatures = signatures, + initAllTables = false) + .runner + + runner + .feed( + _tfBertSignatures.getOrElse( + ModelSignatureConstants.InputIdsV1.key, + "missing_input_id_key"), + tokenTensors) + .feed( + _tfBertSignatures + .getOrElse(ModelSignatureConstants.AttentionMaskV1.key, "missing_input_mask_key"), + maskTensors) + .feed( + _tfBertSignatures + .getOrElse(ModelSignatureConstants.TokenTypeIdsV1.key, "missing_segment_ids_key"), + segmentTensors) + .fetch(_tfBertSignatures + .getOrElse(ModelSignatureConstants.PoolerOutput.key, "missing_pooled_output_key")) + + val outs = runner.run().asScala + val embeddings = TensorResources.extractFloats(outs.head) + + tokenTensors.close() + maskTensors.close() + segmentTensors.close() + tensors.clearSession(outs) + tensors.clearTensors() + + embeddings + } val dim = embeddings.length / batchLength embeddings.grouped(dim).toArray @@ -200,17 +300,17 @@ private[johnsnowlabs] class Bert( 
segmentBuffers.offset(offset).write(Array.fill(maxSentenceLength)(0L)) } - val runner = tensorflowWrapper + val tokenTensors = tensors.createLongBufferTensor(shape, tokenBuffers) + val maskTensors = tensors.createLongBufferTensor(shape, maskBuffers) + val segmentTensors = tensors.createLongBufferTensor(shape, segmentBuffers) + + val runner = tensorflowWrapper.get .getTFSessionWithSignature( configProtoBytes = configProtoBytes, savedSignatures = signatures, initAllTables = false) .runner - val tokenTensors = tensors.createLongBufferTensor(shape, tokenBuffers) - val maskTensors = tensors.createLongBufferTensor(shape, maskBuffers) - val segmentTensors = tensors.createLongBufferTensor(shape, segmentBuffers) - runner .feed( _tfBertSignatures.getOrElse( @@ -257,7 +357,6 @@ private[johnsnowlabs] class Bert( maxSentenceLength, sentenceStartTokenId, sentenceEndTokenId) - val vectors = tag(encoded) /*Combine tokens and calculated embeddings*/ @@ -324,7 +423,6 @@ private[johnsnowlabs] class Bert( maxSentenceLength, sentenceStartTokenId, sentenceEndTokenId) - val embeddings = if (isLong) { tagSequenceSBert(encoded) } else { diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/DeBerta.scala b/src/main/scala/com/johnsnowlabs/ml/ai/DeBerta.scala index 2f2638c5acd65e..bbf4ac83b1862b 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/DeBerta.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/DeBerta.scala @@ -16,10 +16,13 @@ package com.johnsnowlabs.ml.ai +import ai.onnxruntime.OnnxTensor import com.johnsnowlabs.ml.ai.util.PrepareEmbeddings -import com.johnsnowlabs.ml.tensorflow.sentencepiece.{SentencePieceWrapper, SentencepieceEncoder} +import com.johnsnowlabs.ml.onnx.OnnxWrapper +import com.johnsnowlabs.ml.tensorflow.sentencepiece._ import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} import 
com.johnsnowlabs.nlp.annotators.common._ import scala.collection.JavaConverters._ @@ -34,7 +37,8 @@ import scala.collection.JavaConverters._ * Configuration for TensorFlow session */ class DeBerta( - val tensorflowWrapper: TensorflowWrapper, + val tensorflowWrapper: Option[TensorflowWrapper], + val onnxWrapper: Option[OnnxWrapper], val spp: SentencePieceWrapper, batchSize: Int, configProtoBytes: Option[Array[Byte]] = None, @@ -44,6 +48,11 @@ class DeBerta( val _tfDeBertaSignatures: Map[String, String] = signatures.getOrElse(ModelSignatureManager.apply()) + val detectedEngine: String = + if (tensorflowWrapper.isDefined) TensorFlow.name + else if (onnxWrapper.isDefined) ONNX.name + else TensorFlow.name + // keys representing the input and output tensors of the DeBERTa model private val SentenceStartTokenId = spp.getSppModel.pieceToId("[CLS]") private val SentenceEndTokenId = spp.getSppModel.pieceToId("[SEP]") @@ -51,52 +60,96 @@ class DeBerta( private val SentencePieceDelimiterId = spp.getSppModel.pieceToId("▁") def tag(batch: Seq[Array[Int]]): Seq[Array[Array[Float]]] = { - + /* Actual size of each sentence to skip padding in the TF model */ val maxSentenceLength = batch.map(pieceIds => pieceIds.length).max val batchLength = batch.length - val tensors = new TensorResources() - - val (tokenTensors, maskTensors, segmentTensors) = - PrepareEmbeddings.prepareBatchTensorsWithSegment( - tensors = tensors, - batch = batch, - maxSentenceLength = maxSentenceLength, - batchLength = batchLength, - sentencePadTokenId = SentencePadTokenId) - - val runner = tensorflowWrapper - .getTFSessionWithSignature( - configProtoBytes = configProtoBytes, - savedSignatures = signatures, - initAllTables = false) - .runner - - runner - .feed( - _tfDeBertaSignatures.getOrElse( - ModelSignatureConstants.InputIds.key, - "missing_input_id_key"), - tokenTensors) - .feed( - _tfDeBertaSignatures - .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), - maskTensors) - .feed( 
- _tfDeBertaSignatures - .getOrElse(ModelSignatureConstants.TokenTypeIds.key, "missing_segment_ids_key"), - segmentTensors) - .fetch(_tfDeBertaSignatures - .getOrElse(ModelSignatureConstants.LastHiddenState.key, "missing_sequence_output_key")) - - val outs = runner.run().asScala - val embeddings = TensorResources.extractFloats(outs.head) - - tokenTensors.close() - maskTensors.close() - segmentTensors.close() - tensors.clearSession(outs) - tensors.clearTensors() + val embeddings = detectedEngine match { + + case ONNX.name => + // [nb of encoded sentences , maxSentenceLength] + val (runner, env) = onnxWrapper.get.getSession() + + val tokenTensors = + OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + val maskTensors = + OnnxTensor.createTensor( + env, + batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + + val segmentTensors = + OnnxTensor.createTensor(env, batch.map(x => Array.fill(maxSentenceLength)(0L)).toArray) + + val inputs = + Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava + + try { + val results = runner.run(inputs) + try { + val embeddings = results + .get("last_hidden_state") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + tokenTensors.close() + maskTensors.close() + segmentTensors.close() + // runner.close() + // env.close() + // + embeddings + } finally if (results != null) results.close() + } + case _ => + val tensors = new TensorResources() + + val (tokenTensors, maskTensors, segmentTensors) = + PrepareEmbeddings.prepareBatchTensorsWithSegment( + tensors, + batch, + maxSentenceLength, + batchLength) + + val runner = tensorflowWrapper.get + .getTFSessionWithSignature( + configProtoBytes = configProtoBytes, + savedSignatures = signatures, + initAllTables = false) + .runner + + runner + .feed( + _tfDeBertaSignatures.getOrElse( + ModelSignatureConstants.InputIds.key, + "missing_input_id_key"), + tokenTensors) + .feed( + _tfDeBertaSignatures + 
.getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), + maskTensors) + .feed( + _tfDeBertaSignatures + .getOrElse(ModelSignatureConstants.TokenTypeIds.key, "missing_segment_ids_key"), + segmentTensors) + .fetch( + _tfDeBertaSignatures + .getOrElse( + ModelSignatureConstants.LastHiddenState.key, + "missing_sequence_output_key")) + + val outs = runner.run().asScala + val embeddings = TensorResources.extractFloats(outs.head) + + tokenTensors.close() + maskTensors.close() + segmentTensors.close() + tensors.clearSession(outs) + tensors.clearTensors() + + embeddings + + } PrepareEmbeddings.prepareBatchWordEmbeddings( batch, diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/DistilBert.scala b/src/main/scala/com/johnsnowlabs/ml/ai/DistilBert.scala index 3e0d9a022a52cf..afa6a3b8bb29d5 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/DistilBert.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/DistilBert.scala @@ -16,10 +16,12 @@ package com.johnsnowlabs.ml.ai +import ai.onnxruntime.OnnxTensor import com.johnsnowlabs.ml.ai.util.PrepareEmbeddings +import com.johnsnowlabs.ml.onnx.OnnxWrapper import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} -import com.johnsnowlabs.ml.util.ModelArch +import com.johnsnowlabs.ml.util.{ModelArch, ONNX, TensorFlow} import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.{Annotation, AnnotatorType} @@ -66,7 +68,8 @@ import scala.collection.JavaConverters._ * Configuration for TensorFlow session */ private[johnsnowlabs] class DistilBert( - val tensorflowWrapper: TensorflowWrapper, + val tensorflowWrapper: Option[TensorflowWrapper], + val onnxWrapper: Option[OnnxWrapper], sentenceStartTokenId: Int, sentenceEndTokenId: Int, configProtoBytes: Option[Array[Byte]] = None, @@ -75,6 +78,10 @@ private[johnsnowlabs] class DistilBert( extends Serializable { val _tfBertSignatures: Map[String, 
String] = signatures.getOrElse(ModelSignatureManager.apply()) + val detectedEngine: String = + if (tensorflowWrapper.isDefined) TensorFlow.name + else if (onnxWrapper.isDefined) ONNX.name + else TensorFlow.name private def sessionWarmup(): Unit = { val dummyInput = @@ -93,46 +100,88 @@ private[johnsnowlabs] class DistilBert( val maxSentenceLength = batch.map(pieceIds => pieceIds.length).max val batchLength = batch.length - val tensors = new TensorResources() - - val (tokenTensors, maskTensors) = - PrepareEmbeddings.prepareBatchTensors( - tensors = tensors, - batch = batch, - maxSentenceLength = maxSentenceLength, - batchLength = batchLength) - - val runner = tensorflowWrapper - .getTFSessionWithSignature( - configProtoBytes = configProtoBytes, - savedSignatures = signatures, - initAllTables = false) - .runner - - runner - .feed( - _tfBertSignatures.getOrElse(ModelSignatureConstants.InputIds.key, "missing_input_id_key"), - tokenTensors) - .feed( - _tfBertSignatures - .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), - maskTensors) - .fetch(_tfBertSignatures - .getOrElse(ModelSignatureConstants.LastHiddenState.key, "missing_sequence_output_key")) - - val outs = runner.run().asScala - val embeddings = TensorResources.extractFloats(outs.head) - - tokenTensors.close() - maskTensors.close() - tensors.clearSession(outs) - tensors.clearTensors() + val embeddings = detectedEngine match { + case ONNX.name => + // [nb of encoded sentences , maxSentenceLength] + val (runner, env) = onnxWrapper.get.getSession() + + val tokenTensors = + OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + val maskTensors = + OnnxTensor.createTensor( + env, + batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + + val inputs = + Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava + + // TODO: A try without a catch or finally is equivalent to putting its body in a block; no exceptions are handled. 
+ try { + val results = runner.run(inputs) + try { + val embeddings = results + .get("last_hidden_state") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + tokenTensors.close() + maskTensors.close() + + embeddings + } finally if (results != null) results.close() + } + case _ => + val tensors = new TensorResources() + + val (tokenTensors, maskTensors, segmentTensors) = + PrepareEmbeddings.prepareBatchTensorsWithSegment( + tensors, + batch, + maxSentenceLength, + batchLength) + + val runner = tensorflowWrapper.get + .getTFSessionWithSignature( + configProtoBytes = configProtoBytes, + savedSignatures = signatures, + initAllTables = false) + .runner + + runner + .feed( + _tfBertSignatures.getOrElse( + ModelSignatureConstants.InputIds.key, + "missing_input_id_key"), + tokenTensors) + .feed( + _tfBertSignatures + .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), + maskTensors) + .fetch( + _tfBertSignatures + .getOrElse( + ModelSignatureConstants.LastHiddenState.key, + "missing_sequence_output_key")) + + val outs = runner.run().asScala + val embeddings = TensorResources.extractFloats(outs.head) + + tokenTensors.close() + maskTensors.close() + segmentTensors.close() + tensors.clearSession(outs) + tensors.clearTensors() + + embeddings + } PrepareEmbeddings.prepareBatchWordEmbeddings( batch, embeddings, maxSentenceLength, batchLength) + } /** @param batch @@ -154,7 +203,7 @@ private[johnsnowlabs] class DistilBert( maxSentenceLength = maxSentenceLength, batchLength = batchLength) - val runner = tensorflowWrapper + val runner = tensorflowWrapper.get .getTFSessionWithSignature( configProtoBytes = configProtoBytes, savedSignatures = signatures, diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/RoBerta.scala b/src/main/scala/com/johnsnowlabs/ml/ai/RoBerta.scala index b5d0f7c8c51560..1e903ff0d4a345 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/RoBerta.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/RoBerta.scala @@ -16,10 
+16,12 @@ package com.johnsnowlabs.ml.ai +import ai.onnxruntime.OnnxTensor import com.johnsnowlabs.ml.ai.util.PrepareEmbeddings +import com.johnsnowlabs.ml.onnx.OnnxWrapper import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} -import com.johnsnowlabs.ml.util.ModelArch +import com.johnsnowlabs.ml.util.{ModelArch, ONNX, TensorFlow} import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.{Annotation, AnnotatorType} @@ -39,7 +41,8 @@ import scala.collection.JavaConverters._ * Model's inputs and output(s) signatures */ private[johnsnowlabs] class RoBerta( - val tensorflowWrapper: TensorflowWrapper, + val tensorflowWrapper: Option[TensorflowWrapper], + val onnxWrapper: Option[OnnxWrapper], sentenceStartTokenId: Int, sentenceEndTokenId: Int, padTokenId: Int, @@ -50,6 +53,10 @@ private[johnsnowlabs] class RoBerta( val _tfRoBertaSignatures: Map[String, String] = signatures.getOrElse(ModelSignatureManager.apply()) + val detectedEngine: String = + if (tensorflowWrapper.isDefined) TensorFlow.name + else if (onnxWrapper.isDefined) ONNX.name + else TensorFlow.name private def sessionWarmup(): Unit = { val dummyInput = @@ -68,42 +75,81 @@ private[johnsnowlabs] class RoBerta( val maxSentenceLength = batch.map(pieceIds => pieceIds.length).max val batchLength = batch.length - val tensors = new TensorResources() - - val (tokenTensors, maskTensors) = - PrepareEmbeddings.prepareBatchTensors( - tensors = tensors, - batch = batch, - maxSentenceLength = maxSentenceLength, - batchLength = batchLength, - sentencePadTokenId = padTokenId) - - val runner = tensorflowWrapper - .getTFSessionWithSignature( - configProtoBytes = configProtoBytes, - savedSignatures = signatures, - initAllTables = false) - .runner - - runner - .feed( - _tfRoBertaSignatures - .getOrElse(ModelSignatureConstants.InputIds.key, "missing_input_id_key"), - tokenTensors) - .feed( - 
_tfRoBertaSignatures - .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), - maskTensors) - .fetch(_tfRoBertaSignatures - .getOrElse(ModelSignatureConstants.LastHiddenState.key, "missing_sequence_output_key")) - - val outs = runner.run().asScala - val embeddings = TensorResources.extractFloats(outs.head) - - tokenTensors.close() - maskTensors.close() - tensors.clearSession(outs) - tensors.clearTensors() + val embeddings = detectedEngine match { + + case ONNX.name => + // [nb of encoded sentences , maxSentenceLength] + val (runner, env) = onnxWrapper.get.getSession() + + val tokenTensors = + OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + val maskTensors = + OnnxTensor.createTensor( + env, + batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + + val inputs = + Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava + + // TODO: A try without a catch or finally is equivalent to putting its body in a block; no exceptions are handled. 
+ try { + val results = runner.run(inputs) + try { + val embeddings = results + .get("last_hidden_state") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + tokenTensors.close() + maskTensors.close() + embeddings + + } finally if (results != null) results.close() + } + case _ => + val tensors = new TensorResources() + + val (tokenTensors, maskTensors) = + PrepareEmbeddings.prepareBatchTensors( + tensors = tensors, + batch = batch, + maxSentenceLength = maxSentenceLength, + batchLength = batchLength, + sentencePadTokenId = padTokenId) + + val runner = tensorflowWrapper.get + .getTFSessionWithSignature( + configProtoBytes = configProtoBytes, + savedSignatures = signatures, + initAllTables = false) + .runner + + runner + .feed( + _tfRoBertaSignatures + .getOrElse(ModelSignatureConstants.InputIds.key, "missing_input_id_key"), + tokenTensors) + .feed( + _tfRoBertaSignatures + .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), + maskTensors) + .fetch( + _tfRoBertaSignatures + .getOrElse( + ModelSignatureConstants.LastHiddenState.key, + "missing_sequence_output_key")) + + val outs = runner.run().asScala + val embeddings = TensorResources.extractFloats(outs.head) + + tokenTensors.close() + maskTensors.close() + tensors.clearSession(outs) + tensors.clearTensors() + + embeddings + } PrepareEmbeddings.prepareBatchWordEmbeddings( batch, @@ -133,7 +179,7 @@ private[johnsnowlabs] class RoBerta( batchLength = batchLength, sentencePadTokenId = padTokenId) - val runner = tensorflowWrapper + val runner = tensorflowWrapper.get .getTFSessionWithSignature( configProtoBytes = configProtoBytes, savedSignatures = signatures, diff --git a/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxSerializeModel.scala b/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxSerializeModel.scala new file mode 100644 index 00000000000000..b6acafabbb8a3a --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxSerializeModel.scala @@ -0,0 +1,98 @@ +/* + * Copyright 
2017-2022 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.ml.onnx + +import ai.onnxruntime.OrtSession.SessionOptions +import com.johnsnowlabs.util.FileHelper +import org.apache.commons.io.FileUtils +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.sql.SparkSession + +import java.io.File +import java.nio.file.{Files, Paths} +import java.util.UUID + +trait WriteOnnxModel { + + def writeOnnxModel( + path: String, + spark: SparkSession, + onnxWrapper: OnnxWrapper, + suffix: String, + fileName: String): Unit = { + val uri = new java.net.URI(path.replaceAllLiterally("\\", "/")) + val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration) + + // 1. Create tmp folder + val tmpFolder = Files + .createTempDirectory(UUID.randomUUID().toString.takeRight(12) + suffix) + .toAbsolutePath + .toString + + val onnxFile = Paths.get(tmpFolder, fileName).toString + + // 2. Save Tensorflow state + onnxWrapper.saveToFile(onnxFile) + + // 3. Copy to dest folder + fs.copyFromLocalFile(new Path(onnxFile), new Path(path)) + + // 4. 
Remove tmp folder + FileUtils.deleteDirectory(new File(tmpFolder)) + } + +} + +trait ReadOnnxModel { + val onnxFile: String + + def readOnnxModel( + path: String, + spark: SparkSession, + suffix: String, + zipped: Boolean = true, + useBundle: Boolean = false, + sessionOptions: Option[SessionOptions] = None): OnnxWrapper = { + + val uri = new java.net.URI(path.replaceAllLiterally("\\", "/")) + val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration) + + // 1. Create tmp directory + val tmpFolder = Files + .createTempDirectory(UUID.randomUUID().toString.takeRight(12) + suffix) + .toAbsolutePath + .toString + + // 2. Copy to local dir + fs.copyToLocalFile(new Path(path, onnxFile), new Path(tmpFolder)) + + val localPath = new Path(tmpFolder, onnxFile).toString + + // 3. Read ONNX state + val onnxWrapper = OnnxWrapper.read( + localPath, + zipped = zipped, + useBundle = useBundle, + sessionOptions = sessionOptions) + + // 4. Remove tmp folder + FileHelper.delete(tmpFolder) + + onnxWrapper + } + +} diff --git a/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxWrapper.scala b/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxWrapper.scala new file mode 100644 index 00000000000000..3e755615521afe --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxWrapper.scala @@ -0,0 +1,162 @@ +/* + * Copyright 2017-2022 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.johnsnowlabs.ml.onnx + +import ai.onnxruntime.OrtSession.SessionOptions +import ai.onnxruntime.OrtSession.SessionOptions.{ExecutionMode, OptLevel} +import ai.onnxruntime.providers.OrtCUDAProviderOptions +import ai.onnxruntime.{OrtEnvironment, OrtSession} +import com.johnsnowlabs.util.{FileHelper, ZipArchiveUtil} +import org.apache.commons.io.FileUtils +import org.slf4j.{Logger, LoggerFactory} + +import java.io._ +import java.nio.file.{Files, Paths} +import java.util.UUID + +class OnnxWrapper(var onnxModel: Array[Byte]) extends Serializable { + + /** For Deserialization */ + def this() = { + this(null) + } + + // Important for serialization on none-kyro serializers + @transient private var m_session: OrtSession = _ + @transient private var m_env: OrtEnvironment = _ + @transient private val logger = LoggerFactory.getLogger("OnnxWrapper") + + def getSession(sessionOptions: Option[SessionOptions] = None): (OrtSession, OrtEnvironment) = + this.synchronized { + if (m_session == null && m_env == null) { + val (session, env) = OnnxWrapper.withSafeOnnxModelLoader(onnxModel, sessionOptions) + m_env = env + m_session = session + } + (m_session, m_env) + } + + def saveToFile(file: String): Unit = { + // 1. Create tmp director + val tmpFolder = Files + .createTempDirectory(UUID.randomUUID().toString.takeRight(12) + "_onnx") + .toAbsolutePath + .toString + + // 2. Save onnx model + val onnxFile = Paths.get(tmpFolder, file).toString + FileUtils.writeByteArrayToFile(new File(onnxFile), onnxModel) + + // 4. Zip folder + ZipArchiveUtil.zip(tmpFolder, file) + + // 5. 
Remove tmp directory + FileHelper.delete(tmpFolder) + } + +} + +/** Companion object */ +object OnnxWrapper { + private[OnnxWrapper] val logger: Logger = LoggerFactory.getLogger("OnnxWrapper") + + // TODO: make sure this.synchronized is needed or it's not a bottleneck + private def withSafeOnnxModelLoader( + onnxModel: Array[Byte], + sessionOptions: Option[SessionOptions] = None): (OrtSession, OrtEnvironment) = + this.synchronized { + val env = OrtEnvironment.getEnvironment() + + val opts = + if (sessionOptions.isDefined) sessionOptions.get else new OrtSession.SessionOptions() + + val providers = OrtEnvironment.getAvailableProviders + + if (providers.toArray.map(x => x.toString).contains("CUDA")) { + logger.info("using CUDA") + // it seems there is no easy way to use multiple GPUs + // at least not without using multiple threads + // TODO: add support for multiple GPUs + // TODO: allow user to specify which GPU to use + val gpuDeviceId = 0 // The GPU device ID to execute on + val cudaOpts = new OrtCUDAProviderOptions(gpuDeviceId) + // TODO: incorporate other cuda-related configs + // cudaOpts.add("gpu_mem_limit", "" + (512 * 1024 * 1024)) + // sessOptions.addCUDA(gpuDeviceId) + opts.addCUDA(cudaOpts) + } else { + logger.info("using CPUs") + // TODO: the following configs can be tested for performance + // However, so far, they seem to be slower than the ones used + // opts.setIntraOpNumThreads(Runtime.getRuntime.availableProcessors()) + // opts.setMemoryPatternOptimization(true) + // opts.setCPUArenaAllocator(false) + opts.setIntraOpNumThreads(6) + opts.setOptimizationLevel(OptLevel.ALL_OPT) + opts.setExecutionMode(ExecutionMode.SEQUENTIAL) + } + + val session = env.createSession(onnxModel, opts) + (session, env) + } + + def read( + modelPath: String, + zipped: Boolean = true, + useBundle: Boolean = false, + modelName: String = "model", + sessionOptions: Option[SessionOptions] = None): OnnxWrapper = { + + // 1. 
Create tmp folder + val tmpFolder = Files + .createTempDirectory(UUID.randomUUID().toString.takeRight(12) + "_onnx") + .toAbsolutePath + .toString + + // 2. Unpack archive + val folder = + if (zipped) + ZipArchiveUtil.unzip(new File(modelPath), Some(tmpFolder)) + else + modelPath + + // TODO: simplify this logic of useBundle + val (session, env, modelBytes) = + if (useBundle) { + val onnxFile = Paths.get(modelPath, s"$modelName.onnx").toString + val modelFile = new File(onnxFile) + val modelBytes = FileUtils.readFileToByteArray(modelFile) + val (session, env) = withSafeOnnxModelLoader(modelBytes, sessionOptions) + (session, env, modelBytes) + } else { + val modelFile = new File(folder).list().head + val fullPath = Paths.get(folder, modelFile).toFile + val modelBytes = FileUtils.readFileToByteArray(fullPath) + val (session, env) = withSafeOnnxModelLoader(modelBytes, sessionOptions) + (session, env, modelBytes) + } + + // 4. Remove tmp folder + FileHelper.delete(tmpFolder) + + val onnxWrapper = new OnnxWrapper(modelBytes) + onnxWrapper.m_session = session + onnxWrapper.m_env = env + onnxWrapper + } + +} diff --git a/src/main/scala/com/johnsnowlabs/ml/util/LoadExternalModel.scala b/src/main/scala/com/johnsnowlabs/ml/util/LoadExternalModel.scala index 8e1f737be3a512..58aff6825f0408 100644 --- a/src/main/scala/com/johnsnowlabs/ml/util/LoadExternalModel.scala +++ b/src/main/scala/com/johnsnowlabs/ml/util/LoadExternalModel.scala @@ -37,12 +37,26 @@ object LoadExternalModel { | ├── variables.data-00000-of-00001 | └── variables.index | + |A typical imported ONNX model has the following structure: + | + |├── assets/ + | ├── your-assets-are-here (vocab, sp model, labels, etc.) + |├── model.onnx + | + |A typical imported ONNX model for Seq2Seq has the following structure: + | + |├── assets/ + | ├── your-assets-are-here (vocab, sp model, labels, etc.) 
+ |├── encoder_model.onnx + |├── decoder_model.onnx + |├── decoder_with_past_model.onnx (not used in this release) + | |Please make sure you follow provided notebooks to import external models into Spark NLP: |https://github.com/JohnSnowLabs/spark-nlp/discussions/5669""".stripMargin } def isTensorFlowModel(modelPath: String): Boolean = { - val tfSavedModel = new File(modelPath, ModelEngine.tensorflowModelName) + val tfSavedModel = new File(modelPath, TensorFlow.modelName) tfSavedModel.exists() } @@ -50,11 +64,11 @@ object LoadExternalModel { def isOnnxModel(modelPath: String, isEncoderDecoder: Boolean = false): Boolean = { if (isEncoderDecoder) { - val onnxEncoderModel = new File(modelPath, ModelEngine.onnxEncoderModel) - val onnxDecoderModel = new File(modelPath, ModelEngine.onnxDecoderModel) + val onnxEncoderModel = new File(modelPath, ONNX.encoderModel) + val onnxDecoderModel = new File(modelPath, ONNX.decoderModel) onnxEncoderModel.exists() && onnxDecoderModel.exists() } else { - val onnxModel = new File(modelPath, ModelEngine.onnxModelName) + val onnxModel = new File(modelPath, ONNX.modelName) onnxModel.exists() } @@ -80,12 +94,12 @@ object LoadExternalModel { val onnxModelExist = isOnnxModel(modelPath, isEncoderDecoder) if (tfSavedModelExist) { - ModelEngine.tensorflow + TensorFlow.name } else if (onnxModelExist) { - ModelEngine.onnx + ONNX.name } else { require(tfSavedModelExist || onnxModelExist, notSupportedEngineError) - ModelEngine.unk + Unknown.name } } diff --git a/src/main/scala/com/johnsnowlabs/ml/util/ModelEngine.scala b/src/main/scala/com/johnsnowlabs/ml/util/ModelEngine.scala index 9e8b93e9991219..061a42e7caa930 100644 --- a/src/main/scala/com/johnsnowlabs/ml/util/ModelEngine.scala +++ b/src/main/scala/com/johnsnowlabs/ml/util/ModelEngine.scala @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 John Snow Labs + * Copyright 2017-2023 John Snow Labs * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in 
compliance with the License. @@ -16,13 +16,24 @@ package com.johnsnowlabs.ml.util -object ModelEngine { - val tensorflow = "tensorflow" - val tensorflowModelName = "saved_model.pb" - val onnx = "onnx" - val onnxModelName = "model.onnx" - val onnxEncoderModel = "encoder_model.onnx" - val onnxDecoderModel = "decoder_model.onnx" - val onnxDecoderWithPastModel = "decoder_with_past_model.onnx" - val unk = "unk" +sealed trait ModelEngine + +final case object TensorFlow extends ModelEngine { + val name = "tensorflow" + val modelName = "saved_model.pb" +} +final case object PyTorch extends ModelEngine { + val name = "pytorch" +} + +final case object ONNX extends ModelEngine { + val name = "onnx" + val modelName = "model.onnx" + val encoderModel = "encoder_model.onnx" + val decoderModel = "decoder_model.onnx" + val decoderWithPastModel = "decoder_with_past_model.onnx" +} + +final case object Unknown extends ModelEngine { + val name = "unk" } diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasEngine.scala b/src/main/scala/com/johnsnowlabs/nlp/HasEngine.scala index 541d50b34afee5..39870b3073ce12 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/HasEngine.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/HasEngine.scala @@ -16,7 +16,7 @@ package com.johnsnowlabs.nlp -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import org.apache.spark.ml.param.Param trait HasEngine extends ParamsAndFeaturesWritable { @@ -27,7 +27,7 @@ trait HasEngine extends ParamsAndFeaturesWritable { */ val engine = new Param[String](this, "engine", "Deep Learning engine used for this model") - setDefault(engine, ModelEngine.tensorflow) + setDefault(engine, TensorFlow.name) /** @group getParam */ def getEngine: String = $(engine) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/audio/HubertForCTC.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/audio/HubertForCTC.scala index 989b3ee0634d30..520ffd0bda69ab 100644 --- 
a/src/main/scala/com/johnsnowlabs/nlp/annotators/audio/HubertForCTC.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/audio/HubertForCTC.scala @@ -22,7 +22,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.audio.feature_extractor.Preprocessor import org.apache.spark.ml.util.Identifiable @@ -213,7 +213,7 @@ trait ReadHubertForAudioDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/audio/Wav2Vec2ForCTC.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/audio/Wav2Vec2ForCTC.scala index 4e51a3812f1a25..5927de36c587e9 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/audio/Wav2Vec2ForCTC.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/audio/Wav2Vec2ForCTC.scala @@ -27,7 +27,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp.AnnotatorType.{AUDIO, DOCUMENT} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.audio.feature_extractor.Preprocessor @@ -340,7 +340,7 @@ trait ReadWav2Vec2ForAudioDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForQuestionAnswering.scala 
b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForQuestionAnswering.scala index 1d2026e7f1bd3a..217fbc6ca25947 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForQuestionAnswering.scala @@ -28,7 +28,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.serialization.MapFeature import org.apache.spark.broadcast.Broadcast @@ -317,7 +317,7 @@ trait ReadAlbertForQuestionAnsweringDLModel annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForSequenceClassification.scala index 8e110c8460ec5a..f0d61bcaade650 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForSequenceClassification.scala @@ -29,7 +29,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -372,7 +372,7 @@ trait ReadAlbertForSequenceDLModel extends ReadTensorflowModel with ReadSentence annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case 
TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForTokenClassification.scala index 4abbb18a6307f1..89e61223d63097 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForTokenClassification.scala @@ -29,7 +29,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.{ModelEngine, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -343,7 +343,7 @@ trait ReadAlbertForTokenDLModel extends ReadTensorflowModel with ReadSentencePie annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForQuestionAnswering.scala index e8d17348c1b968..d48b40dcb65c08 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForQuestionAnswering.scala @@ -23,7 +23,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.{ModelEngine, TensorFlow} import com.johnsnowlabs.nlp._ import 
com.johnsnowlabs.nlp.serialization.MapFeature import org.apache.spark.broadcast.Broadcast @@ -325,7 +325,7 @@ trait ReadBertForQuestionAnsweringDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForSequenceClassification.scala index d873915c1e412e..ff0bb3aeb4676a 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForSequenceClassification.scala @@ -24,7 +24,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.{ModelEngine, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -383,7 +383,7 @@ trait ReadBertForSequenceDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForTokenClassification.scala index c9062cd3d99b83..0c287de7d2cd64 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForTokenClassification.scala +++ 
b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForTokenClassification.scala @@ -23,7 +23,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -346,7 +346,7 @@ trait ReadBertForTokenDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForZeroShotClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForZeroShotClassification.scala index b0149ab660d9c8..6c6ddc35140d1a 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForZeroShotClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForZeroShotClassification.scala @@ -19,23 +19,19 @@ package com.johnsnowlabs.nlp.annotators.classifier.dl import com.johnsnowlabs.ml.ai.BertClassification import com.johnsnowlabs.ml.tensorflow._ import com.johnsnowlabs.ml.util.LoadExternalModel.{ - loadSentencePieceAsset, loadTextAsset, modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature -import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper} import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.param.{BooleanParam, IntArrayParam, IntParam} import org.apache.spark.ml.util.Identifiable import 
org.apache.spark.sql.SparkSession -import java.io.File - /** BertForZeroShotClassification using a `ModelForSequenceClassification` trained on NLI (natural * language inference) tasks. Equivalent of `BertForSequenceClassification` models, but these * models don't require a hardcoded number of potential classes, they can be chosen at runtime. @@ -421,7 +417,7 @@ trait ReadBertForZeroShotDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnswering.scala index 784003488a1a83..e55e6adf4b6cb5 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnswering.scala @@ -28,7 +28,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.serialization.MapFeature import org.apache.spark.broadcast.Broadcast @@ -323,7 +323,7 @@ trait ReadCamemBertForQADLModel extends ReadTensorflowModel with ReadSentencePie annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForSequenceClassification.scala 
b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForSequenceClassification.scala index d96d8e59318e1e..9519af01f8a7ac 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForSequenceClassification.scala @@ -29,7 +29,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -378,7 +378,7 @@ trait ReadCamemBertForSequenceDLModel extends ReadTensorflowModel with ReadSente annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForTokenClassification.scala index 7b440341739223..275cd4bba61238 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForTokenClassification.scala @@ -29,7 +29,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -345,7 +345,7 @@ trait ReadCamemBertForTokenDLModel extends ReadTensorflowModel with ReadSentence annotatorModel.set(annotatorModel.engine, detectedEngine) 
detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForQuestionAnswering.scala index 06e9c955d1f0f6..600b85da999a6d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForQuestionAnswering.scala @@ -28,7 +28,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.serialization.MapFeature import org.apache.spark.broadcast.Broadcast @@ -323,7 +323,7 @@ trait ReadDeBertaForQuestionAnsweringDLModel annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForSequenceClassification.scala index dae903e43e14df..0f025ebca7c367 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForSequenceClassification.scala @@ -29,7 +29,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import 
com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -374,7 +374,7 @@ trait ReadDeBertaForSequenceDLModel extends ReadTensorflowModel with ReadSentenc annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForTokenClassification.scala index b09d9d5298bca9..81b3fdff7def4b 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForTokenClassification.scala @@ -29,7 +29,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -343,7 +343,7 @@ trait ReadDeBertaForTokenDLModel extends ReadTensorflowModel with ReadSentencePi annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForQuestionAnswering.scala index e950099b9e82fc..be3709d19b6279 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForQuestionAnswering.scala +++ 
b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForQuestionAnswering.scala @@ -23,7 +23,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.serialization.MapFeature import org.apache.spark.broadcast.Broadcast @@ -328,7 +328,7 @@ trait ReadDistilBertForQuestionAnsweringDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForSequenceClassification.scala index 4c2699cf848e28..aee25f66d01640 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForSequenceClassification.scala @@ -23,7 +23,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -383,7 +383,7 @@ trait ReadDistilBertForSequenceDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git 
a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForTokenClassification.scala index 53690d311104e2..20616a8303e7fc 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForTokenClassification.scala @@ -23,7 +23,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -351,7 +351,7 @@ trait ReadDistilBertForTokenDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForZeroShotClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForZeroShotClassification.scala index 27f34509c867aa..b1afba431726d2 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForZeroShotClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForZeroShotClassification.scala @@ -23,7 +23,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -423,7 +423,7 @@ trait ReadDistilBertForZeroShotDLModel 
extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForQuestionAnswering.scala index f9cdbeaf323127..453b8ac7e2cb17 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForQuestionAnswering.scala @@ -23,7 +23,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.serialization.MapFeature import org.apache.spark.broadcast.Broadcast @@ -349,7 +349,7 @@ trait ReadLongformerForQuestionAnsweringDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForSequenceClassification.scala index e6c91330eaf371..6dd293f033515c 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForSequenceClassification.scala @@ -23,7 +23,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } 
-import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -403,7 +403,7 @@ trait ReadLongformerForSequenceDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForTokenClassification.scala index 1957ccb20b00cb..176fea3d1e19f2 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForTokenClassification.scala @@ -23,7 +23,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -371,7 +371,7 @@ trait ReadLongformerForTokenDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForQuestionAnswering.scala index 27212384881b1b..35bd006fc4a3e5 100644 --- 
a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForQuestionAnswering.scala @@ -23,7 +23,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.{ModelEngine, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.serialization.MapFeature import org.apache.spark.broadcast.Broadcast @@ -347,7 +347,7 @@ trait ReadRoBertaForQuestionAnsweringDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForSequenceClassification.scala index f3c76c0b88f915..5e4b268af48f0d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForSequenceClassification.scala @@ -23,7 +23,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -398,7 +398,7 @@ trait ReadRoBertaForSequenceDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, 
useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForTokenClassification.scala index 65c14a953c5b05..742306621bd376 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForTokenClassification.scala @@ -23,7 +23,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -369,7 +369,7 @@ trait ReadRoBertaForTokenDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForZeroShotClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForZeroShotClassification.scala index ff24acd94a7894..60041627854e15 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForZeroShotClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForZeroShotClassification.scala @@ -23,7 +23,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -441,7 +441,7 @@ trait ReadRoBertaForZeroShotDLModel 
extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/TapasForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/TapasForQuestionAnswering.scala index b9a1e253525054..22b3760cb8fa69 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/TapasForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/TapasForQuestionAnswering.scala @@ -23,7 +23,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp.base.TableAssembler import com.johnsnowlabs.nlp.{Annotation, AnnotatorType, HasPretrained, ParamsAndFeaturesReadable} import org.apache.spark.broadcast.Broadcast @@ -265,7 +265,7 @@ trait ReadTapasForQuestionAnsweringDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForQuestionAnswering.scala index a42fef9c880aea..01920477d5a672 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForQuestionAnswering.scala @@ -28,7 +28,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, 
notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.serialization.MapFeature import org.apache.spark.broadcast.Broadcast @@ -323,7 +323,7 @@ trait ReadXlmRoBertaForQuestionAnsweringDLModel annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForSequenceClassification.scala index eada6953d7bca1..add55d9270b8be 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForSequenceClassification.scala @@ -29,7 +29,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -375,7 +375,7 @@ trait ReadXlmRoBertaForSequenceDLModel extends ReadTensorflowModel with ReadSent annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForTokenClassification.scala index 38a379d9ae529a..ded252b097d481 100644 --- 
a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForTokenClassification.scala @@ -29,7 +29,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -345,7 +345,7 @@ trait ReadXlmRoBertaForTokenDLModel extends ReadTensorflowModel with ReadSentenc annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlnetForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlnetForSequenceClassification.scala index 593e9d51e37a6f..b9e786c4a869fb 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlnetForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlnetForSequenceClassification.scala @@ -29,7 +29,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -371,7 +371,7 @@ trait ReadXlnetForSequenceDLModel extends ReadTensorflowModel with ReadSentenceP annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = 
false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlnetForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlnetForTokenClassification.scala index 3f9e9f54df57e8..43b1e4dcd46103 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlnetForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlnetForTokenClassification.scala @@ -29,7 +29,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -343,7 +343,7 @@ trait ReadXlnetForTokenDLModel extends ReadTensorflowModel with ReadSentencePiec annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/coref/SpanBertCorefModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/coref/SpanBertCorefModel.scala index 1b097a76813d03..bb48cbc9d7c575 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/coref/SpanBertCorefModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/coref/SpanBertCorefModel.scala @@ -26,7 +26,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.{BasicTokenizer, WordpieceEncoder} @@ -449,7 +449,7 @@ trait ReadSpanBertCorefTensorflowModel extends ReadTensorflowModel { 
annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/ConvNextForImageClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/ConvNextForImageClassification.scala index b87de63bae8e58..3af9710b12ff9b 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/ConvNextForImageClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/ConvNextForImageClassification.scala @@ -23,7 +23,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.cv.feature_extractor.Preprocessor import org.apache.spark.broadcast.Broadcast @@ -353,7 +353,7 @@ trait ReadConvNextForImageDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/SwinForImageClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/SwinForImageClassification.scala index 4341fa23cd4bd6..344200c0c2e501 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/SwinForImageClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/SwinForImageClassification.scala @@ -22,7 +22,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import 
com.johnsnowlabs.nlp.annotators.cv.feature_extractor.Preprocessor import org.apache.spark.ml.param.{BooleanParam, DoubleParam} @@ -334,7 +334,7 @@ trait ReadSwinForImageDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/ViTForImageClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/ViTForImageClassification.scala index e786739b6ac718..985fbc041251a5 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/ViTForImageClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/ViTForImageClassification.scala @@ -27,7 +27,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp.AnnotatorType.{CATEGORY, IMAGE} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.cv.feature_extractor.Preprocessor @@ -39,8 +39,6 @@ import org.apache.spark.sql.SparkSession import org.json4s._ import org.json4s.jackson.JsonMethods._ -import java.io.File - /** Vision Transformer (ViT) for image classification. 
* * ViT is a transformer based alternative to the convolutional neural networks usually used for @@ -384,7 +382,7 @@ trait ReadViTForImageDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/ld/dl/LanguageDetectorDL.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/ld/dl/LanguageDetectorDL.scala index 79a14a4b6098e5..5e05d06d6c2bec 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/ld/dl/LanguageDetectorDL.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/ld/dl/LanguageDetectorDL.scala @@ -22,7 +22,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -341,7 +341,7 @@ trait ReadLanguageDetectorDLTensorflowModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, _) = TensorflowWrapper.read( localModelPath, diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTransformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTransformer.scala index 66d97181a86e38..aa72c0e4738fbe 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTransformer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTransformer.scala @@ -27,7 +27,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import 
com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp.AnnotatorType.DOCUMENT import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -596,7 +596,7 @@ trait ReadBartTransformerDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read( localModelPath, diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/GPT2Transformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/GPT2Transformer.scala index 246c6e8a6f10ac..29d76fcd0dea17 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/GPT2Transformer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/GPT2Transformer.scala @@ -27,7 +27,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp.AnnotatorType.DOCUMENT import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.{BpeTokenizer, Gpt2Tokenizer} @@ -544,7 +544,7 @@ trait ReadGPT2TransformerDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, _) = TensorflowWrapper.read( localModelPath, diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MarianTransformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MarianTransformer.scala index e15729d3f05a42..ce18cf3ad4f8bd 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MarianTransformer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MarianTransformer.scala @@ -29,7 +29,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import 
com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.serialization.MapFeature import org.apache.spark.broadcast.Broadcast @@ -458,7 +458,7 @@ trait ReadMarianMTDLModel extends ReadTensorflowModel with ReadSentencePieceMode annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read( localModelPath, zipped = false, diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/T5Transformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/T5Transformer.scala index ac70e675f3df97..edc691a236191c 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/T5Transformer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/T5Transformer.scala @@ -32,7 +32,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp.AnnotatorType.DOCUMENT import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -537,7 +537,7 @@ trait ReadT5TransformerDLModel extends ReadTensorflowModel with ReadSentencePiec val spModel = loadSentencePieceAsset(localModelPath, "spiece.model") detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read( localModelPath, zipped = false, diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/AlbertEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AlbertEmbeddings.scala index 1f4c0e8a2923b3..c8da89256f2b4c 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/AlbertEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AlbertEmbeddings.scala @@ -28,7 +28,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ 
modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -396,7 +396,7 @@ trait ReadAlbertDLModel extends ReadTensorflowModel with ReadSentencePieceModel annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertEmbeddings.scala index fb47275a624d3e..3717c09bf3b9bc 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertEmbeddings.scala @@ -17,13 +17,14 @@ package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.ml.ai.Bert +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ import com.johnsnowlabs.ml.util.LoadExternalModel.{ loadTextAsset, modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.{ModelEngine, ModelArch} +import com.johnsnowlabs.ml.util.{ModelArch, ONNX, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.{BasicTokenizer, WordpieceEncoder} @@ -157,13 +158,30 @@ class BertEmbeddings(override val uid: String) extends AnnotatorModel[BertEmbeddings] with HasBatchedAnnotate[BertEmbeddings] with WriteTensorflowModel + with WriteOnnxModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties with HasEngine { + /** Annotator reference id. 
Used to identify elements in metadata or to refer to this annotator + * type + */ def this() = this(Identifiable.randomUID("BERT_EMBEDDINGS")) + /** Input Annotator Types: DOCUMENT, TOKEN + * + * @group anno + */ + override val inputAnnotatorTypes: Array[String] = + Array(AnnotatorType.DOCUMENT, AnnotatorType.TOKEN) + + /** Output Annotator Types: WORD_EMBEDDINGS + * + * @group anno + */ + override val outputAnnotatorType: AnnotatorType = AnnotatorType.WORD_EMBEDDINGS + /** @group setParam */ def sentenceStartTokenId: Int = { $$(vocabulary)("[CLS]") @@ -241,12 +259,14 @@ class BertEmbeddings(override val uid: String) /** @group setParam */ def setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper): BertEmbeddings = { + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper]): BertEmbeddings = { if (_model.isEmpty) { _model = Some( spark.sparkContext.broadcast( new Bert( tensorflowWrapper, + onnxWrapper, sentenceStartTokenId, sentenceEndTokenId, configProtoBytes = getConfigProtoBytes, @@ -257,7 +277,6 @@ class BertEmbeddings(override val uid: String) this } - /** @group getParam */ def getModelIfNotSet: Bert = _model.get.value /** Set Embeddings dimensions for the BERT model Only possible to set this when the first time @@ -354,22 +373,30 @@ class BertEmbeddings(override val uid: String) wrapEmbeddingsMetadata(dataset.col(getOutputCol), $(dimension), Some($(storageRef)))) } - /** Annotator reference id. 
Used to identify elements in metadata or to refer to this annotator - * type - */ - override val inputAnnotatorTypes: Array[String] = - Array(AnnotatorType.DOCUMENT, AnnotatorType.TOKEN) - override val outputAnnotatorType: AnnotatorType = AnnotatorType.WORD_EMBEDDINGS - override def onWrite(path: String, spark: SparkSession): Unit = { super.onWrite(path, spark) - writeTensorflowModelV2( - path, - spark, - getModelIfNotSet.tensorflowWrapper, - "_bert", - BertEmbeddings.tfFile, - configProtoBytes = getConfigProtoBytes) + val suffix = "_bert" + + getEngine match { + case TensorFlow.name => + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper.get, + suffix, + BertEmbeddings.tfFile, + configProtoBytes = getConfigProtoBytes) + case ONNX.name => + writeOnnxModel( + path, + spark, + getModelIfNotSet.onnxWrapper.get, + suffix, + BertEmbeddings.onnxFile) + + case _ => + throw new Exception(notSupportedEngineError) + } } } @@ -391,15 +418,27 @@ trait ReadablePretrainedBertModel super.pretrained(name, lang, remoteLoc) } -trait ReadBertDLModel extends ReadTensorflowModel { +trait ReadBertDLModel extends ReadTensorflowModel with ReadOnnxModel { this: ParamsAndFeaturesReadable[BertEmbeddings] => override val tfFile: String = "bert_tensorflow" + override val onnxFile: String = "bert_onnx" def readModel(instance: BertEmbeddings, path: String, spark: SparkSession): Unit = { - val tf = readTensorflowModel(path, spark, "_bert_tf", initAllTables = false) - instance.setModelIfNotSet(spark, tf) + instance.getEngine match { + case TensorFlow.name => + val tfWrapper = readTensorflowModel(path, spark, "_bert_tf", initAllTables = false) + instance.setModelIfNotSet(spark, Some(tfWrapper), None) + + case ONNX.name => { + val onnxWrapper = + readOnnxModel(path, spark, "_bert_onnx", zipped = true, useBundle = false, None) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper)) + } + case _ => + throw new Exception(notSupportedEngineError) + } } addReader(readModel) 
@@ -417,8 +456,8 @@ trait ReadBertDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => - val (wrapper, signatures) = + case TensorFlow.name => + val (tfWrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) val _signatures = signatures match { @@ -431,7 +470,12 @@ trait ReadBertDLModel extends ReadTensorflowModel { */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper) + .setModelIfNotSet(spark, Some(tfWrapper), None) + + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, None, Some(onnxWrapper)) case _ => throw new Exception(notSupportedEngineError) @@ -439,6 +483,7 @@ trait ReadBertDLModel extends ReadTensorflowModel { annotatorModel } + } /** This is the companion object of [[BertEmbeddings]]. Please refer to that class for the diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertSentenceEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertSentenceEmbeddings.scala index 6e7af48bd113b0..c2e36695688a38 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertSentenceEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertSentenceEmbeddings.scala @@ -17,13 +17,14 @@ package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.ml.ai.Bert +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ import com.johnsnowlabs.ml.util.LoadExternalModel.{ loadTextAsset, modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.{ModelEngine, ModelArch} +import com.johnsnowlabs.ml.util.{ModelArch, ONNX, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.{BasicTokenizer, WordpieceEncoder} 
@@ -122,7 +123,7 @@ import org.apache.spark.sql.{DataFrame, SparkSession} * }}} * * @see - * [[BertEmbeddings]] for token-level embeddings + * [[BertSentenceEmbeddings]] for sentence-level embeddings * @see * [[com.johnsnowlabs.nlp.annotators.classifier.dl.BertForSequenceClassification BertForSequenceClassification]] * for embeddings with a sequence classification layer on top @@ -152,6 +153,7 @@ class BertSentenceEmbeddings(override val uid: String) extends AnnotatorModel[BertSentenceEmbeddings] with HasBatchedAnnotate[BertSentenceEmbeddings] with WriteTensorflowModel + with WriteOnnxModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties @@ -302,13 +304,17 @@ class BertSentenceEmbeddings(override val uid: String) def getModelIfNotSet: Bert = _model.get.value /** @group setParam */ - def setModelIfNotSet(spark: SparkSession, tensorflow: TensorflowWrapper): this.type = { + def setModelIfNotSet( + spark: SparkSession, + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper]): this.type = { if (_model.isEmpty) { _model = Some( spark.sparkContext.broadcast( new Bert( - tensorflow, + tensorflowWrapper, + onnxWrapper, sentenceStartTokenId, sentenceEndTokenId, configProtoBytes = getConfigProtoBytes, @@ -391,13 +397,28 @@ class BertSentenceEmbeddings(override val uid: String) override def onWrite(path: String, spark: SparkSession): Unit = { super.onWrite(path, spark) - writeTensorflowModelV2( - path, - spark, - getModelIfNotSet.tensorflowWrapper, - "_bert_sentence", - BertSentenceEmbeddings.tfFile, - configProtoBytes = getConfigProtoBytes) + + getEngine match { + case TensorFlow.name => + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper.get, + "_bert_sentence", + BertSentenceEmbeddings.tfFile, + configProtoBytes = getConfigProtoBytes) + case ONNX.name => + writeOnnxModel( + path, + spark, + getModelIfNotSet.onnxWrapper.get, + "_bert_sentence", + BertSentenceEmbeddings.onnxFile) + + case _ 
=> + throw new Exception(notSupportedEngineError) + } + } } @@ -419,15 +440,34 @@ trait ReadablePretrainedBertSentenceModel super.pretrained(name, lang, remoteLoc) } -trait ReadBertSentenceDLModel extends ReadTensorflowModel { +trait ReadBertSentenceDLModel extends ReadTensorflowModel with ReadOnnxModel { this: ParamsAndFeaturesReadable[BertSentenceEmbeddings] => override val tfFile: String = "bert_sentence_tensorflow" + override val onnxFile: String = "bert_sentence_onnx" def readModel(instance: BertSentenceEmbeddings, path: String, spark: SparkSession): Unit = { - val tf = readTensorflowModel(path, spark, "_bert_sentence_tf", initAllTables = false) - instance.setModelIfNotSet(spark, tf) + instance.getEngine match { + case TensorFlow.name => + val tfWrapper = + readTensorflowModel(path, spark, "_bert_sentence_tf", initAllTables = false) + instance.setModelIfNotSet(spark, Some(tfWrapper), None) + + case ONNX.name => { + val onnxWrapper = + readOnnxModel( + path, + spark, + "_bert_sentence_onnx", + zipped = true, + useBundle = false, + None) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper)) + } + case _ => + throw new Exception(notSupportedEngineError) + } } addReader(readModel) @@ -445,8 +485,8 @@ trait ReadBertSentenceDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => - val (wrapper, signatures) = + case TensorFlow.name => + val (tfWrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) val _signatures = signatures match { @@ -459,7 +499,12 @@ trait ReadBertSentenceDLModel extends ReadTensorflowModel { */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper) + .setModelIfNotSet(spark, Some(tfWrapper), None) + + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, None, Some(onnxWrapper)) case _ => 
throw new Exception(notSupportedEngineError) diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/CamemBertEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/CamemBertEmbeddings.scala index 12c2b4d1edaef0..914d9b87b91449 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/CamemBertEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/CamemBertEmbeddings.scala @@ -12,7 +12,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -362,7 +362,7 @@ trait ReadCamemBertDLModel extends ReadTensorflowModel with ReadSentencePieceMod annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/DeBertaEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/DeBertaEmbeddings.scala index e502484f1f1521..56f57238e3a84e 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/DeBertaEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/DeBertaEmbeddings.scala @@ -17,6 +17,7 @@ package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.ml.ai.DeBerta +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ ReadSentencePieceModel, @@ -28,7 +29,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.{ModelEngine, ONNX, TensorFlow} import com.johnsnowlabs.nlp._ import 
com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -160,6 +161,7 @@ class DeBertaEmbeddings(override val uid: String) extends AnnotatorModel[DeBertaEmbeddings] with HasBatchedAnnotate[DeBertaEmbeddings] with WriteTensorflowModel + with WriteOnnxModel with WriteSentencePieceModel with HasEmbeddingsProperties with HasStorageRef @@ -247,7 +249,8 @@ class DeBertaEmbeddings(override val uid: String) /** @group setParam */ def setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper, + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper], spp: SentencePieceWrapper): DeBertaEmbeddings = { if (_model.isEmpty) { @@ -255,6 +258,7 @@ class DeBertaEmbeddings(override val uid: String) spark.sparkContext.broadcast( new DeBerta( tensorflowWrapper, + onnxWrapper, spp, batchSize = $(batchSize), configProtoBytes = getConfigProtoBytes, @@ -308,30 +312,41 @@ class DeBertaEmbeddings(override val uid: String) }) } - override def onWrite(path: String, spark: SparkSession): Unit = { - super.onWrite(path, spark) - writeTensorflowModelV2( - path, - spark, - getModelIfNotSet.tensorflowWrapper, - "_deberta", - DeBertaEmbeddings.tfFile, - configProtoBytes = getConfigProtoBytes) - writeSentencePieceModel( - path, - spark, - getModelIfNotSet.spp, - "_deberta", - DeBertaEmbeddings.sppFile) - - } - override protected def afterAnnotate(dataset: DataFrame): DataFrame = { dataset.withColumn( getOutputCol, wrapEmbeddingsMetadata(dataset.col(getOutputCol), $(dimension), Some($(storageRef)))) } + override def onWrite(path: String, spark: SparkSession): Unit = { + super.onWrite(path, spark) + val suffix = "_deberta" + + getEngine match { + case TensorFlow.name => + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper.get, + suffix, + DeBertaEmbeddings.tfFile, + configProtoBytes = getConfigProtoBytes) + case ONNX.name => + writeOnnxModel( + path, + spark, + 
getModelIfNotSet.onnxWrapper.get, + suffix, + DeBertaEmbeddings.onnxFile) + + case _ => + throw new Exception(notSupportedEngineError) + } + + writeSentencePieceModel(path, spark, getModelIfNotSet.spp, suffix, DeBertaEmbeddings.sppFile) + + } + } trait ReadablePretrainedDeBertaModel @@ -351,16 +366,32 @@ trait ReadablePretrainedDeBertaModel super.pretrained(name, lang, remoteLoc) } -trait ReadDeBertaDLModel extends ReadTensorflowModel with ReadSentencePieceModel { +trait ReadDeBertaDLModel + extends ReadTensorflowModel + with ReadSentencePieceModel + with ReadOnnxModel { this: ParamsAndFeaturesReadable[DeBertaEmbeddings] => override val tfFile: String = "deberta_tensorflow" + override val onnxFile: String = "deberta_onnx" override val sppFile: String = "deberta_spp" def readModel(instance: DeBertaEmbeddings, path: String, spark: SparkSession): Unit = { - val tf = readTensorflowModel(path, spark, "_deberta_tf", initAllTables = false) val spp = readSentencePieceModel(path, spark, "_deberta_spp", sppFile) - instance.setModelIfNotSet(spark, tf, spp) + + instance.getEngine match { + case TensorFlow.name => + val tfWrapper = readTensorflowModel(path, spark, "_deberta_tf", initAllTables = false) + instance.setModelIfNotSet(spark, Some(tfWrapper), None, spp) + + case ONNX.name => { + val onnxWrapper = + readOnnxModel(path, spark, "_deberta_onnx", zipped = true, useBundle = false, None) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) + } + case _ => + throw new Exception(notSupportedEngineError) + } } addReader(readModel) @@ -377,8 +408,8 @@ trait ReadDeBertaDLModel extends ReadTensorflowModel with ReadSentencePieceModel annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => - val (wrapper, signatures) = + case TensorFlow.name => + val (tfWrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) val _signatures = signatures match { @@ -391,7 +422,12 @@ trait 
ReadDeBertaDLModel extends ReadTensorflowModel with ReadSentencePieceModel */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper, spModel) + .setModelIfNotSet(spark, Some(tfWrapper), None, spModel) + + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) case _ => throw new Exception(notSupportedEngineError) diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/DistilBertEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/DistilBertEmbeddings.scala index 8bcfbd578a2343..d28ce903c48eb0 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/DistilBertEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/DistilBertEmbeddings.scala @@ -17,13 +17,14 @@ package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.ml.ai.DistilBert +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ import com.johnsnowlabs.ml.util.LoadExternalModel.{ loadTextAsset, modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.{ModelEngine, ModelArch} +import com.johnsnowlabs.ml.util.{ModelArch, ONNX, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.{BasicTokenizer, WordpieceEncoder} @@ -161,6 +162,7 @@ class DistilBertEmbeddings(override val uid: String) extends AnnotatorModel[DistilBertEmbeddings] with HasBatchedAnnotate[DistilBertEmbeddings] with WriteTensorflowModel + with WriteOnnxModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties @@ -171,6 +173,19 @@ class DistilBertEmbeddings(override val uid: String) */ def this() = this(Identifiable.randomUID("DISTILBERT_EMBEDDINGS")) + /** Input Annotator Types: DOCUMENT. 
TOKEN + * + * @group param + */ + override val inputAnnotatorTypes: Array[String] = + Array(AnnotatorType.DOCUMENT, AnnotatorType.TOKEN) + + /** Output Annotator Types: WORD_EMBEDDINGS + * + * @group param + */ + override val outputAnnotatorType: AnnotatorType = AnnotatorType.WORD_EMBEDDINGS + def sentenceStartTokenId: Int = { $$(vocabulary)("[CLS]") } @@ -246,12 +261,14 @@ class DistilBertEmbeddings(override val uid: String) /** @group setParam */ def setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper): DistilBertEmbeddings = { + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper]): DistilBertEmbeddings = { if (_model.isEmpty) { _model = Some( spark.sparkContext.broadcast( new DistilBert( tensorflowWrapper, + onnxWrapper, sentenceStartTokenId, sentenceEndTokenId, configProtoBytes = getConfigProtoBytes, @@ -357,28 +374,31 @@ class DistilBertEmbeddings(override val uid: String) wrapEmbeddingsMetadata(dataset.col(getOutputCol), $(dimension), Some($(storageRef)))) } - /** Input Annotator Types: DOCUMENT. 
TOKEN - * - * @group param - */ - override val inputAnnotatorTypes: Array[String] = - Array(AnnotatorType.DOCUMENT, AnnotatorType.TOKEN) - - /** Output Annotator Types: WORD_EMBEDDINGS - * - * @group param - */ - override val outputAnnotatorType: AnnotatorType = AnnotatorType.WORD_EMBEDDINGS - override def onWrite(path: String, spark: SparkSession): Unit = { super.onWrite(path, spark) - writeTensorflowModelV2( - path, - spark, - getModelIfNotSet.tensorflowWrapper, - "_distilbert", - DistilBertEmbeddings.tfFile, - configProtoBytes = getConfigProtoBytes) + val suffix = "_distilbert" + + getEngine match { + case TensorFlow.name => + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper.get, + suffix, + DistilBertEmbeddings.tfFile, + configProtoBytes = getConfigProtoBytes) + case ONNX.name => + writeOnnxModel( + path, + spark, + getModelIfNotSet.onnxWrapper.get, + suffix, + DistilBertEmbeddings.onnxFile) + + case _ => + throw new Exception(notSupportedEngineError) + } + } } @@ -400,15 +420,27 @@ trait ReadablePretrainedDistilBertModel super.pretrained(name, lang, remoteLoc) } -trait ReadDistilBertDLModel extends ReadTensorflowModel { +trait ReadDistilBertDLModel extends ReadTensorflowModel with ReadOnnxModel { this: ParamsAndFeaturesReadable[DistilBertEmbeddings] => override val tfFile: String = "distilbert_tensorflow" + override val onnxFile: String = "bert_onnx" def readModel(instance: DistilBertEmbeddings, path: String, spark: SparkSession): Unit = { - val tf = readTensorflowModel(path, spark, "_distilbert_tf", initAllTables = false) - instance.setModelIfNotSet(spark, tf) + instance.getEngine match { + case TensorFlow.name => + val tfWrapper = readTensorflowModel(path, spark, "_distilbert_tf", initAllTables = false) + instance.setModelIfNotSet(spark, Some(tfWrapper), None) + + case ONNX.name => { + val onnxWrapper = + readOnnxModel(path, spark, "_distilbert_onnx", zipped = true, useBundle = false, None) + instance.setModelIfNotSet(spark, 
None, Some(onnxWrapper)) + } + case _ => + throw new Exception(notSupportedEngineError) + } } addReader(readModel) @@ -426,8 +458,8 @@ trait ReadDistilBertDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => - val (wrapper, signatures) = + case TensorFlow.name => + val (tfWrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) val _signatures = signatures match { @@ -440,7 +472,12 @@ trait ReadDistilBertDLModel extends ReadTensorflowModel { */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper) + .setModelIfNotSet(spark, Some(tfWrapper), None) + + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, None, Some(onnxWrapper)) case _ => throw new Exception(notSupportedEngineError) diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/ElmoEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/ElmoEmbeddings.scala index 647061442198c1..7f12ffc4c89a4d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/ElmoEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/ElmoEmbeddings.scala @@ -19,7 +19,7 @@ package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.ml.ai.Elmo import com.johnsnowlabs.ml.tensorflow._ import com.johnsnowlabs.ml.util.LoadExternalModel.{modelSanityCheck, notSupportedEngineError} -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.storage.HasStorageRef @@ -363,7 +363,7 @@ trait ReadElmoDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, _) = TensorflowWrapper.read( 
localModelPath, zipped = false, diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings.scala index 7984096e2ca163..a42c1b334d9d0b 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings.scala @@ -17,13 +17,14 @@ package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.ml.ai.RoBerta +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ import com.johnsnowlabs.ml.util.LoadExternalModel.{ loadTextAsset, modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.{ModelEngine, ModelArch} +import com.johnsnowlabs.ml.util.{ModelArch, ONNX, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.BpeTokenizer @@ -247,12 +248,14 @@ class LongformerEmbeddings(override val uid: String) /** @group setParam */ def setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper): LongformerEmbeddings = { + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper]): LongformerEmbeddings = { if (_model.isEmpty) { _model = Some( spark.sparkContext.broadcast( new RoBerta( tensorflowWrapper, + onnxWrapper, sentenceStartTokenId, sentenceEndTokenId, padTokenId, @@ -381,7 +384,7 @@ class LongformerEmbeddings(override val uid: String) writeTensorflowModelV2( path, spark, - getModelIfNotSet.tensorflowWrapper, + getModelIfNotSet.tensorflowWrapper.get, "_longformer", LongformerEmbeddings.tfFile, configProtoBytes = getConfigProtoBytes) @@ -414,7 +417,7 @@ trait ReadLongformerDLModel extends ReadTensorflowModel { def readModel(instance: LongformerEmbeddings, path: String, spark: SparkSession): Unit = { val tf = readTensorflowModel(path, spark, "_longformer_tf", initAllTables = 
false) - instance.setModelIfNotSet(spark, tf) + instance.setModelIfNotSet(spark, Some(tf), None) } addReader(readModel) @@ -440,7 +443,7 @@ trait ReadLongformerDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) @@ -454,7 +457,7 @@ trait ReadLongformerDLModel extends ReadTensorflowModel { */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper) + .setModelIfNotSet(spark, Some(wrapper), None) case _ => throw new Exception(notSupportedEngineError) diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings.scala index dae1369d440a2e..02c06bca1b4e77 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings.scala @@ -17,13 +17,14 @@ package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.ml.ai.RoBerta +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ import com.johnsnowlabs.ml.util.LoadExternalModel.{ loadTextAsset, modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.{ModelEngine, ModelArch} +import com.johnsnowlabs.ml.util.{ModelArch, ONNX, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.BpeTokenizer @@ -162,6 +163,7 @@ class RoBertaEmbeddings(override val uid: String) extends AnnotatorModel[RoBertaEmbeddings] with HasBatchedAnnotate[RoBertaEmbeddings] with WriteTensorflowModel + with WriteOnnxModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties @@ -260,12 +262,14 @@ class RoBertaEmbeddings(override val uid: 
String) /** @group setParam */ def setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper): RoBertaEmbeddings = { + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper]): RoBertaEmbeddings = { if (_model.isEmpty) { _model = Some( spark.sparkContext.broadcast( new RoBerta( tensorflowWrapper, + onnxWrapper, sentenceStartTokenId, sentenceEndTokenId, padTokenId, @@ -391,15 +395,29 @@ class RoBertaEmbeddings(override val uid: String) override def onWrite(path: String, spark: SparkSession): Unit = { super.onWrite(path, spark) - writeTensorflowModelV2( - path, - spark, - getModelIfNotSet.tensorflowWrapper, - "_roberta", - RoBertaEmbeddings.tfFile, - configProtoBytes = getConfigProtoBytes) - } + val suffix = "_roberta" + + getEngine match { + case TensorFlow.name => + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper.get, + suffix, + RoBertaEmbeddings.tfFile, + configProtoBytes = getConfigProtoBytes) + case ONNX.name => + writeOnnxModel( + path, + spark, + getModelIfNotSet.onnxWrapper.get, + suffix, + RoBertaEmbeddings.onnxFile) + case _ => + throw new Exception(notSupportedEngineError) + } + } } trait ReadablePretrainedRobertaModel @@ -419,15 +437,27 @@ trait ReadablePretrainedRobertaModel super.pretrained(name, lang, remoteLoc) } -trait ReadRobertaDLModel extends ReadTensorflowModel { +trait ReadRobertaDLModel extends ReadTensorflowModel with ReadOnnxModel { this: ParamsAndFeaturesReadable[RoBertaEmbeddings] => override val tfFile: String = "roberta_tensorflow" + override val onnxFile: String = "roberta_onnx" def readModel(instance: RoBertaEmbeddings, path: String, spark: SparkSession): Unit = { - val tf = readTensorflowModel(path, spark, "_roberta_tf", initAllTables = false) - instance.setModelIfNotSet(spark, tf) + instance.getEngine match { + case TensorFlow.name => + val tfWrapper = readTensorflowModel(path, spark, "_roberta_tf", initAllTables = false) + instance.setModelIfNotSet(spark, 
Some(tfWrapper), None) + + case ONNX.name => { + val onnxWrapper = + readOnnxModel(path, spark, "_roberta_onnx", zipped = true, useBundle = false, None) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper)) + } + case _ => + throw new Exception(notSupportedEngineError) + } } addReader(readModel) @@ -453,7 +483,7 @@ trait ReadRobertaDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) @@ -467,7 +497,12 @@ trait ReadRobertaDLModel extends ReadTensorflowModel { */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper) + .setModelIfNotSet(spark, Some(wrapper), None) + + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, None, Some(onnxWrapper)) case _ => throw new Exception(notSupportedEngineError) diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings.scala index 9cff4ea74fbab5..c41c6c91ab2da9 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings.scala @@ -17,13 +17,14 @@ package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.ml.ai.RoBerta +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ import com.johnsnowlabs.ml.util.LoadExternalModel.{ loadTextAsset, modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.{ModelEngine, ModelArch} +import com.johnsnowlabs.ml.util.{ModelArch, ONNX, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import 
com.johnsnowlabs.nlp.annotators.tokenizer.bpe.BpeTokenizer @@ -257,12 +258,15 @@ class RoBertaSentenceEmbeddings(override val uid: String) /** @group setParam */ def setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper): RoBertaSentenceEmbeddings = { + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper]): RoBertaSentenceEmbeddings = { if (_model.isEmpty) { _model = Some( spark.sparkContext.broadcast( + new RoBerta( tensorflowWrapper, + onnxWrapper, sentenceStartTokenId, sentenceEndTokenId, padTokenId, @@ -368,7 +372,7 @@ class RoBertaSentenceEmbeddings(override val uid: String) writeTensorflowModelV2( path, spark, - getModelIfNotSet.tensorflowWrapper, + getModelIfNotSet.tensorflowWrapper.get, "_roberta", RoBertaSentenceEmbeddings.tfFile, configProtoBytes = getConfigProtoBytes) @@ -403,7 +407,7 @@ trait ReadRobertaSentenceDLModel extends ReadTensorflowModel { def readModel(instance: RoBertaSentenceEmbeddings, path: String, spark: SparkSession): Unit = { val tf = readTensorflowModel(path, spark, "_roberta_tf", initAllTables = false) - instance.setModelIfNotSet(spark, tf) + instance.setModelIfNotSet(spark, Some(tf), None) } addReader(readModel) @@ -429,7 +433,7 @@ trait ReadRobertaSentenceDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) @@ -443,7 +447,7 @@ trait ReadRobertaSentenceDLModel extends ReadTensorflowModel { */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper) + .setModelIfNotSet(spark, Some(wrapper), None) case _ => throw new Exception(notSupportedEngineError) diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/UniversalSentenceEncoder.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/UniversalSentenceEncoder.scala index 
2cc0712a615e1d..89bce984f4c388 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/UniversalSentenceEncoder.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/UniversalSentenceEncoder.scala @@ -23,7 +23,7 @@ import com.johnsnowlabs.ml.tensorflow.{ WriteTensorflowModel } import com.johnsnowlabs.ml.util.LoadExternalModel.{modelSanityCheck, notSupportedEngineError} -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp.AnnotatorType.{DOCUMENT, SENTENCE_EMBEDDINGS} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common.SentenceSplit @@ -349,7 +349,7 @@ trait ReadUSEDLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val wrapper = TensorflowWrapper.readWithSP( localModelPath, diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaEmbeddings.scala index 76cbc656235e4e..107da32535a946 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaEmbeddings.scala @@ -28,7 +28,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.{ModelEngine, ModelArch} +import com.johnsnowlabs.ml.util.{ModelArch, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -394,7 +394,7 @@ trait ReadXlmRobertaDLModel extends ReadTensorflowModel with ReadSentencePieceMo annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git 
a/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaSentenceEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaSentenceEmbeddings.scala index f81836a86e1a75..07df2844768290 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaSentenceEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaSentenceEmbeddings.scala @@ -28,7 +28,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.{ModelEngine, ModelArch} +import com.johnsnowlabs.ml.util.{ModelArch, ModelEngine, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -383,7 +383,7 @@ trait ReadXlmRobertaSentenceDLModel extends ReadTensorflowModel with ReadSentenc annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlnetEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlnetEmbeddings.scala index a5bfac1d55159b..86d2c8b3e1cf97 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlnetEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlnetEmbeddings.scala @@ -28,7 +28,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.{ModelEngine, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -393,7 +393,7 @@ trait ReadXlnetDLModel extends ReadTensorflowModel with ReadSentencePieceModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case 
ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) From d8174cc11a7a9b826a504feff07e52ca079146f3 Mon Sep 17 00:00:00 2001 From: Prabod Rathnayaka Date: Sat, 1 Jul 2023 23:17:09 +1000 Subject: [PATCH 05/13] SPARKNLP 852 - Introducing "E5 Embeddings" for sentence embeddings like e5-large-v2 model (#13859) * Added E5 model * changed test type --------- Co-authored-by: Maziyar Panahi --- .../sparknlp/annotator/embeddings/__init__.py | 1 + .../annotator/embeddings/e5_embeddings.py | 191 ++++++++ python/sparknlp/internal/__init__.py | 5 + .../embeddings/e5_embeddings_test.py | 56 +++ .../scala/com/johnsnowlabs/ml/ai/E5.scala | 174 +++++++ .../nlp/embeddings/E5Embeddings.scala | 447 ++++++++++++++++++ .../nlp/embeddings/E5EmbeddingsTestSpec.scala | 57 +++ 7 files changed, 931 insertions(+) create mode 100644 python/sparknlp/annotator/embeddings/e5_embeddings.py create mode 100644 python/test/annotator/embeddings/e5_embeddings_test.py create mode 100644 src/main/scala/com/johnsnowlabs/ml/ai/E5.scala create mode 100644 src/main/scala/com/johnsnowlabs/nlp/embeddings/E5Embeddings.scala create mode 100644 src/test/scala/com/johnsnowlabs/nlp/embeddings/E5EmbeddingsTestSpec.scala diff --git a/python/sparknlp/annotator/embeddings/__init__.py b/python/sparknlp/annotator/embeddings/__init__.py index 02ee80f98fa264..2b9a87a7dbf40e 100644 --- a/python/sparknlp/annotator/embeddings/__init__.py +++ b/python/sparknlp/annotator/embeddings/__init__.py @@ -22,6 +22,7 @@ from sparknlp.annotator.embeddings.distil_bert_embeddings import * from sparknlp.annotator.embeddings.doc2vec import * from sparknlp.annotator.embeddings.elmo_embeddings import * +from sparknlp.annotator.embeddings.e5_embeddings import * from sparknlp.annotator.embeddings.instructor_embeddings import * from sparknlp.annotator.embeddings.longformer_embeddings import * from sparknlp.annotator.embeddings.roberta_embeddings 
import * diff --git a/python/sparknlp/annotator/embeddings/e5_embeddings.py b/python/sparknlp/annotator/embeddings/e5_embeddings.py new file mode 100644 index 00000000000000..ee372290b1a333 --- /dev/null +++ b/python/sparknlp/annotator/embeddings/e5_embeddings.py @@ -0,0 +1,191 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains classes for E5Embeddings.""" + +from sparknlp.common import * + + +class E5Embeddings(AnnotatorModel, + HasEmbeddingsProperties, + HasCaseSensitiveProperties, + HasStorageRef, + HasBatchedAnnotate, + HasMaxSentenceLengthLimit): + """Sentence embeddings using E5. + + E5, a weakly supervised text embedding model that can generate text embeddings tailored to any task (e.g., classification, retrieval, clustering, text evaluation, etc.) + Pretrained models can be loaded with :meth:`.pretrained` of the companion + object: + + >>> embeddings = E5Embeddings.pretrained() \\ + ... .setInputCols(["document"]) \\ + ... .setOutputCol("e5_embeddings") + + + The default model is ``"e5_small"``, if no name is provided. + + For available pretrained models please see the + `Models Hub `__. 
+ + + ====================== ====================== + Input Annotation types Output Annotation type + ====================== ====================== + ``DOCUMENT`` ``SENTENCE_EMBEDDINGS`` + ====================== ====================== + + Parameters + ---------- + batchSize + Size of every batch , by default 8 + dimension + Number of embedding dimensions, by default 768 + caseSensitive + Whether to ignore case in tokens for embeddings matching, by default False + maxSentenceLength + Max sentence length to process, by default 512 + configProtoBytes + ConfigProto from tensorflow, serialized into byte array. + + References + ---------- + `Text Embeddings by Weakly-Supervised Contrastive Pre-training `__ + + https://github.com/microsoft/unilm/tree/master/e5 + + **Paper abstract** + + *This paper presents E5, a family of state-of-the-art text embeddings that transfer + well to a wide range of tasks. The model is trained in a contrastive manner with + weak supervision signals from our curated large-scale text pair dataset (called + CCPairs). E5 can be readily used as a general-purpose embedding model for any + tasks requiring a single-vector representation of texts such as retrieval, clustering, + and classification, achieving strong performance in both zero-shot and fine-tuned + settings. We conduct extensive evaluations on 56 datasets from the BEIR and + MTEB benchmarks. For zero-shot settings, E5 is the first model that outperforms + the strong BM25 baseline on the BEIR retrieval benchmark without using any + labeled data. When fine-tuned, E5 obtains the best results on the MTEB benchmark, + beating existing embedding models with 40× more parameters.* + + Examples + -------- + >>> import sparknlp + >>> from sparknlp.base import * + >>> from sparknlp.annotator import * + >>> from pyspark.ml import Pipeline + >>> documentAssembler = DocumentAssembler() \\ + ... .setInputCol("text") \\ + ... .setOutputCol("document") + >>> embeddings = E5Embeddings.pretrained() \\ + ... 
.setInputCols(["document"]) \\ + ... .setOutputCol("e5_embeddings") + >>> embeddingsFinisher = EmbeddingsFinisher() \\ + ... .setInputCols(["e5_embeddings"]) \\ + ... .setOutputCols("finished_embeddings") \\ + ... .setOutputAsVector(True) + >>> pipeline = Pipeline().setStages([ + ... documentAssembler, + ... embeddings, + ... embeddingsFinisher + ... ]) + >>> data = spark.createDataFrame([["query: how much protein should a female eat", + ... "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day." + \ + ... "But, as you can see from this chart, you'll need to increase that if you're expecting or training for a" + \ + ... "marathon. Check out the chart below to see how much protein you should be eating each day.", + ... ]]).toDF("text") + >>> result = pipeline.fit(data).transform(data) + >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80) + +--------------------------------------------------------------------------------+ + | result| + +--------------------------------------------------------------------------------+ + |[[8.0190285E-4, -0.005974853, -0.072875895, 0.007944068, 0.026059335, -0.0080...| + |[[0.050514214, 0.010061974, -0.04340176, -0.020937217, 0.05170225, 0.01157857...| + +--------------------------------------------------------------------------------+ + """ + + name = "E5Embeddings" + + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + + outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS + configProtoBytes = Param(Params._dummy(), + "configProtoBytes", + "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", + TypeConverters.toListInt) + + + def setConfigProtoBytes(self, b): + """Sets configProto from tensorflow, serialized into byte array. 
+ + Parameters + ---------- + b : List[int] + ConfigProto from tensorflow, serialized into byte array + """ + return self._set(configProtoBytes=b) + + @keyword_only + def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.E5Embeddings", java_model=None): + super(E5Embeddings, self).__init__( + classname=classname, + java_model=java_model + ) + self._setDefault( + dimension=768, + batchSize=8, + maxSentenceLength=512, + caseSensitive=False, + ) + + @staticmethod + def loadSavedModel(folder, spark_session): + """Loads a locally saved model. + + Parameters + ---------- + folder : str + Folder of the saved model + spark_session : pyspark.sql.SparkSession + The current SparkSession + + Returns + ------- + E5Embeddings + The restored model + """ + from sparknlp.internal import _E5Loader + jModel = _E5Loader(folder, spark_session._jsparkSession)._java_obj + return E5Embeddings(java_model=jModel) + + @staticmethod + def pretrained(name="e5_small", lang="en", remote_loc=None): + """Downloads and loads a pretrained model. + + Parameters + ---------- + name : str, optional + Name of the pretrained model, by default "e5_small" + lang : str, optional + Language of the pretrained model, by default "en" + remote_loc : str, optional + Optional remote address of the resource, by default None. Will use + Spark NLPs repositories otherwise. 
+ + Returns + ------- + E5Embeddings + The restored model + """ + from sparknlp.pretrained import ResourceDownloader + return ResourceDownloader.downloadModel(E5Embeddings, name, lang, remote_loc) diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py index 9131d37788b9e4..3de0d91a830188 100644 --- a/python/sparknlp/internal/__init__.py +++ b/python/sparknlp/internal/__init__.py @@ -143,6 +143,11 @@ def __init__(self, path, jspark): super(_ElmoLoader, self).__init__("com.johnsnowlabs.nlp.embeddings.ElmoEmbeddings.loadSavedModel", path, jspark) +class _E5Loader(ExtendedJavaWrapper): + def __init__(self, path, jspark): + super(_E5Loader, self).__init__("com.johnsnowlabs.nlp.embeddings.E5Embeddings.loadSavedModel", path, jspark) + + class _GPT2Loader(ExtendedJavaWrapper): def __init__(self, path, jspark): super(_GPT2Loader, self).__init__( diff --git a/python/test/annotator/embeddings/e5_embeddings_test.py b/python/test/annotator/embeddings/e5_embeddings_test.py new file mode 100644 index 00000000000000..97f0360c66a1e9 --- /dev/null +++ b/python/test/annotator/embeddings/e5_embeddings_test.py @@ -0,0 +1,56 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests +from test.util import SparkContextForTest + + +@pytest.mark.slow +class E5EmbeddingsTestSpec(unittest.TestCase): + def setUp(self): + self.spark = SparkContextForTest.spark + self.tested_annotator = E5Embeddings \ + .pretrained() \ + .setInputCols(["documents"]) \ + .setOutputCol("e5") + + def runTest(self): + data = self.spark.createDataFrame([ + [1, "query: how much protein should a female eat"], + [2, "query: summit define"], + [3, "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 " + "is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're " + "expecting or training for a marathon. Check out the chart below to see how much protein you should " + "be eating each day.", ], + [4, "passage: Definition of summit for English Language Learners. : 1 the highest point of a mountain :" + " the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the " + "leaders of two or more governments."] + ]).toDF("id", "text") + + document_assembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("documents") + + e5 = self.tested_annotator + + pipeline = Pipeline().setStages([document_assembler, e5]) + results = pipeline.fit(data).transform(data) + + results.select("e5.embeddings").show(truncate=False) diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/E5.scala b/src/main/scala/com/johnsnowlabs/ml/ai/E5.scala new file mode 100644 index 00000000000000..d01a37d6165d3a --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/ml/ai/E5.scala @@ -0,0 +1,174 @@ +/* + * Copyright 2017 - 2023 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.ml.ai + +import com.johnsnowlabs.ml.tensorflow.sentencepiece.SentencePieceWrapper +import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} +import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} +import com.johnsnowlabs.nlp.{Annotation, AnnotatorType} +import com.johnsnowlabs.nlp.annotators.common._ +import scala.collection.JavaConverters._ + +/** E5 Sentence embeddings model + * @param tensorflow + * tensorflow wrapper + * @param configProtoBytes + * config proto bytes + * @param sentenceStartTokenId + * sentence start token id + * @param sentenceEndTokenId + * sentence end token id + * @param signatures + * signatures + */ +private[johnsnowlabs] class E5( + val tensorflow: TensorflowWrapper, + configProtoBytes: Option[Array[Byte]] = None, + sentenceStartTokenId: Int, + sentenceEndTokenId: Int, + signatures: Option[Map[String, String]] = None) + extends Serializable { + + private val _tfInstructorSignatures: Map[String, String] = + signatures.getOrElse(ModelSignatureManager.apply()) + private val paddingTokenId = 0 + private val eosTokenId = 1 + + /** Get sentence embeddings for a batch of sentences + * @param batch + * batch of sentences + * @return + * sentence embeddings + */ + private def getSentenceEmbedding(batch: Seq[Array[Int]]): Array[Array[Float]] = { + // get max sentence length + val sequencesLength = batch.map(x => x.length).toArray + val maxSentenceLength = sequencesLength.max + val batchLength = batch.length + + // encode batch + val 
tensorEncoder = new TensorResources() + val inputDim = batch.length * maxSentenceLength + + // create buffers + val encoderInputBuffers = tensorEncoder.createIntBuffer(inputDim) + val encoderAttentionMaskBuffers = tensorEncoder.createIntBuffer(inputDim) + + val shape = Array(batch.length.toLong, maxSentenceLength) + + batch.zipWithIndex.foreach { case (tokenIds, idx) => + val offset = idx * maxSentenceLength + val diff = maxSentenceLength - tokenIds.length + + // pad with 0 + val s = tokenIds.take(maxSentenceLength) ++ Array.fill[Int](diff)(this.paddingTokenId) + encoderInputBuffers.offset(offset).write(s) + + // create attention mask + val mask = s.map(x => if (x != this.paddingTokenId) 1 else 0) + encoderAttentionMaskBuffers.offset(offset).write(mask) + + } + + // create tensors + val encoderInputTensors = tensorEncoder.createIntBufferTensor(shape, encoderInputBuffers) + val encoderAttentionMaskTensors = + tensorEncoder.createIntBufferTensor(shape, encoderAttentionMaskBuffers) + + // run model + val runner = tensorflow + .getTFSessionWithSignature( + configProtoBytes = configProtoBytes, + initAllTables = false, + savedSignatures = signatures) + .runner + + runner + .feed( + _tfInstructorSignatures.getOrElse( + ModelSignatureConstants.EncoderInputIds.key, + "missing_encoder_input_ids"), + encoderInputTensors) + .feed( + _tfInstructorSignatures.getOrElse( + ModelSignatureConstants.EncoderAttentionMask.key, + "missing_encoder_attention_mask"), + encoderAttentionMaskTensors) + .fetch(_tfInstructorSignatures + .getOrElse(ModelSignatureConstants.LastHiddenState.key, "missing_last_hidden_state")) + + // get embeddings + val sentenceEmbeddings = runner.run().asScala + val sentenceEmbeddingsFloats = TensorResources.extractFloats(sentenceEmbeddings.head) + val dim = sentenceEmbeddingsFloats.length / batchLength + + // group embeddings + val sentenceEmbeddingsFloatsArray = sentenceEmbeddingsFloats.grouped(dim).toArray + + // close buffers + 
sentenceEmbeddings.foreach(_.close()) + encoderInputTensors.close() + encoderAttentionMaskTensors.close() + tensorEncoder.clearTensors() + tensorEncoder.clearSession(sentenceEmbeddings) + + sentenceEmbeddingsFloatsArray + } + + /** Predict sentence embeddings for a batch of sentences + * @param sentences + * sentences + * @param tokenizedSentences + * tokenized sentences + * @param batchSize + * batch size + * @param maxSentenceLength + * max sentence length + * @return + */ + def predict( + sentences: Seq[Annotation], + tokenizedSentences: Seq[WordpieceTokenizedSentence], + batchSize: Int, + maxSentenceLength: Int): Seq[Annotation] = { + + tokenizedSentences + .zip(sentences) + .zipWithIndex + .grouped(batchSize) + .toArray + .flatMap { batch => + val tokensBatch = batch.map(x => (x._1._1.tokens)) + val tokens = tokensBatch.map(x => + Array(sentenceStartTokenId) ++ x.map(y => y.pieceId).take(maxSentenceLength) ++ Array( + sentenceEndTokenId)) + val sentencesBatch = batch.map(x => x._1._2) + val sentenceEmbeddings = getSentenceEmbedding(tokens) + + batch.zip(sentenceEmbeddings).map { case (sentence, vectors) => + Annotation( + annotatorType = AnnotatorType.SENTENCE_EMBEDDINGS, + begin = sentence._1._2.begin, + end = sentence._1._2.end, + result = sentence._1._2.result, + metadata = sentence._1._2.metadata, + embeddings = vectors) + } + } + } + +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/E5Embeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/E5Embeddings.scala new file mode 100644 index 00000000000000..d6a7fc59f05aeb --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/E5Embeddings.scala @@ -0,0 +1,447 @@ +/* + * Copyright 2017-2022 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.nlp.embeddings + +import com.johnsnowlabs.ml.ai.E5 +import com.johnsnowlabs.ml.tensorflow._ +import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ + ReadSentencePieceModel, + WriteSentencePieceModel +} +import com.johnsnowlabs.ml.util.LoadExternalModel.{ + loadSentencePieceAsset, + loadTextAsset, + modelSanityCheck, + notSupportedEngineError +} +import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.nlp._ +import com.johnsnowlabs.nlp.annotators.common._ +import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.{BasicTokenizer, WordpieceEncoder} +import com.johnsnowlabs.nlp.serialization.MapFeature +import com.johnsnowlabs.storage.HasStorageRef +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.slf4j.{Logger, LoggerFactory} + +/** Sentence embeddings using E5. + * + * E5, an instruction-finetuned text embedding model that can generate text embeddings tailored + * to any task (e.g., classification, retrieval, clustering, text evaluation, etc.) + * + * Pretrained models can be loaded with `pretrained` of the companion object: + * {{{ + * val embeddings = E5Embeddings.pretrained() + * .setInputCols("document") + * .setOutputCol("e5_embeddings") + * }}} + * The default model is `"e5_small"`, if no name is provided. + * + * For available pretrained models please see the + * [[https://sparknlp.org/models?q=E5 Models Hub]]. 
+ * + * For extended examples of usage, see + * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/embeddings/E5EmbeddingsTestSpec.scala E5EmbeddingsTestSpec]]. + * + * '''Sources''' : + * + * [[https://arxiv.org/pdf/2212.03533 Text Embeddings by Weakly-Supervised Contrastive Pre-training]] + * + * [[https://github.com/microsoft/unilm/tree/master/e5 E5 Github Repository]] + * + * ''' Paper abstract ''' + * + * ''This paper presents E5, a family of state-of-the-art text embeddings that transfer well to a + * wide range of tasks. The model is trained in a contrastive manner with weak supervision + * signals from our curated large-scale text pair dataset (called CCPairs). E5 can be readily + * used as a general-purpose embedding model for any tasks requiring a single-vector + * representation of texts such as retrieval, clustering, and classification, achieving strong + * performance in both zero-shot and fine-tuned settings. We conduct extensive evaluations on 56 + * datasets from the BEIR and MTEB benchmarks. For zero-shot settings, E5 is the first model that + * outperforms the strong BM25 baseline on the BEIR retrieval benchmark without using any labeled + * data. 
When fine-tuned, E5 obtains the best results on the MTEB benchmark, beating existing + * embedding models with 40× more parameters.'' + * + * ==Example== + * {{{ + * import spark.implicits._ + * import com.johnsnowlabs.nlp.base.DocumentAssembler + * import com.johnsnowlabs.nlp.annotators.Tokenizer + * import com.johnsnowlabs.nlp.embeddings.E5Embeddings + * import com.johnsnowlabs.nlp.EmbeddingsFinisher + * import org.apache.spark.ml.Pipeline + * + * val documentAssembler = new DocumentAssembler() + * .setInputCol("text") + * .setOutputCol("document") + * + * val embeddings = E5Embeddings.pretrained("e5_small", "en") + * .setInputCols("document") + * .setOutputCol("e5_embeddings") + * + * val embeddingsFinisher = new EmbeddingsFinisher() + * .setInputCols("e5_embeddings") + * .setOutputCols("finished_embeddings") + * .setOutputAsVector(true) + * + * val pipeline = new Pipeline().setStages(Array( + * documentAssembler, + * embeddings, + * embeddingsFinisher + * )) + * + * val data = Seq("query: how much protein should a female eat", + * "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day." + + * But, as you can see from this chart, you'll need to increase that if you're expecting or training for a" + + * marathon. Check out the chart below to see how much protein you should be eating each day." 
+ * + * ).toDF("text") + * val result = pipeline.fit(data).transform(data) + * + * result.selectExpr("explode(finished_embeddings) as result").show(1, 80) + * +--------------------------------------------------------------------------------+ + * | result| + * +--------------------------------------------------------------------------------+ + * |[[8.0190285E-4, -0.005974853, -0.072875895, 0.007944068, 0.026059335, -0.0080...| + * [[0.050514214, 0.010061974, -0.04340176, -0.020937217, 0.05170225, 0.01157857...| + * +--------------------------------------------------------------------------------+ + * }}} + * + * @see + * [[https://sparknlp.org/docs/en/annotators Annotators Main Page]] for a list of transformer + * based embeddings + * @param uid + * required uid for storing annotator to disk + * @groupname anno Annotator types + * @groupdesc anno + * Required input and expected output annotator types + * @groupname Ungrouped Members + * @groupname param Parameters + * @groupname setParam Parameter setters + * @groupname getParam Parameter getters + * @groupname Ungrouped Members + * @groupprio param 1 + * @groupprio anno 2 + * @groupprio Ungrouped 3 + * @groupprio setParam 4 + * @groupprio getParam 5 + * @groupdesc param + * A list of (hyper-)parameter keys this annotator can take. Users can set and get the + * parameter values through setters and getters, respectively. + */ +class E5Embeddings(override val uid: String) + extends AnnotatorModel[E5Embeddings] + with HasBatchedAnnotate[E5Embeddings] + with WriteTensorflowModel + with HasEmbeddingsProperties + with HasStorageRef + with HasCaseSensitiveProperties + with HasEngine { + + /** Annotator reference id. 
Used to identify elements in metadata or to refer to this annotator + * type + */ + override val inputAnnotatorTypes: Array[String] = + Array(AnnotatorType.DOCUMENT) + override val outputAnnotatorType: AnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS + + /** ConfigProto from tensorflow, serialized into byte array. Get with + * `config_proto.SerializeToString()` + * + * @group param + */ + val configProtoBytes = new IntArrayParam( + this, + "configProtoBytes", + "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()") + + /** Max sentence length to process (Default: `128`) + * + * @group param + */ + val maxSentenceLength = + new IntParam(this, "maxSentenceLength", "Max sentence length to process") + + def sentenceStartTokenId: Int = { + $$(vocabulary)("[CLS]") + } + + /** @group setParam */ + def sentenceEndTokenId: Int = { + $$(vocabulary)("[SEP]") + } + + /** Vocabulary used to encode the words to ids with WordPieceEncoder + * + * @group param + */ + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() + + /** @group setParam */ + def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) + + /** It contains TF model signatures for the laded saved model + * + * @group param + */ + val signatures = + new MapFeature[String, String](model = this, name = "signatures").setProtected() + private var _model: Option[Broadcast[E5]] = None + + def this() = this(Identifiable.randomUID("E5_EMBEDDINGS")) + + /** @group setParam */ + def setConfigProtoBytes(bytes: Array[Int]): E5Embeddings.this.type = + set(this.configProtoBytes, bytes) + + /** @group setParam */ + def setMaxSentenceLength(value: Int): this.type = { + require( + value <= 512, + "E5 models do not support sequences longer than 512 because of trainable positional embeddings.") + require(value >= 1, "The maxSentenceLength must be at least 1") + set(maxSentenceLength, value) + this + } + + /** @group getParam */ 
+ def getMaxSentenceLength: Int = $(maxSentenceLength) + + /** @group setParam */ + def setSignatures(value: Map[String, String]): this.type = { + if (get(signatures).isEmpty) + set(signatures, value) + this + } + + /** @group setParam */ + def setModelIfNotSet( + spark: SparkSession, + tensorflowWrapper: TensorflowWrapper): E5Embeddings = { + if (_model.isEmpty) { + _model = Some( + spark.sparkContext.broadcast( + new E5( + tensorflowWrapper, + configProtoBytes = getConfigProtoBytes, + sentenceStartTokenId = sentenceStartTokenId, + sentenceEndTokenId = sentenceEndTokenId, + signatures = getSignatures))) + } + + this + } + + /** Set Embeddings dimensions for the BERT model Only possible to set this when the first time + * is saved dimension is not changeable, it comes from BERT config file + * + * @group setParam + */ + override def setDimension(value: Int): this.type = { + if (get(dimension).isEmpty) + set(this.dimension, value) + this + } + + /** Whether to lowercase tokens or not + * + * @group setParam + */ + override def setCaseSensitive(value: Boolean): this.type = { + if (get(caseSensitive).isEmpty) + set(this.caseSensitive, value) + this + } + + setDefault(dimension -> 768, batchSize -> 8, maxSentenceLength -> 128, caseSensitive -> false) + + def tokenize(sentences: Seq[Annotation]): Seq[WordpieceTokenizedSentence] = { + val basicTokenizer = new BasicTokenizer($(caseSensitive)) + val encoder = new WordpieceEncoder($$(vocabulary)) + sentences.map { s => + val sent = Sentence( + content = s.result, + start = s.begin, + end = s.end, + metadata = Some(s.metadata), + index = s.begin) + val tokens = basicTokenizer.tokenize(sent) + val wordpieceTokens = tokens.flatMap(token => encoder.encode(token)) + WordpieceTokenizedSentence(wordpieceTokens) + } + } + + /** takes a document and annotations and produces new annotations of this annotator's annotation + * type + * + * @param batchedAnnotations + * Annotations that correspond to inputAnnotationCols generated by 
previous annotators if any + * @return + * any number of annotations processed for every input annotation. Not necessary one to one + * relationship + */ + override def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] = { + + val allAnnotations = batchedAnnotations + .filter(_.nonEmpty) + .zipWithIndex + .flatMap { case (annotations, i) => + annotations.filter(_.result.nonEmpty).map(x => (x, i)) + } + + // Tokenize sentences + val tokenizedSentences = tokenize(allAnnotations.map(_._1)) + val processedAnnotations = if (allAnnotations.nonEmpty) { + this.getModelIfNotSet.predict( + sentences = allAnnotations.map(_._1), + tokenizedSentences = tokenizedSentences, + batchSize = $(batchSize), + maxSentenceLength = $(maxSentenceLength)) + } else { + Seq() + } + + // Group resulting annotations by rows. If there are not sentences in a given row, return empty sequence + batchedAnnotations.indices.map(rowIndex => { + val rowAnnotations = processedAnnotations + // zip each annotation with its corresponding row index + .zip(allAnnotations) + // select the sentences belonging to the current row + .filter(_._2._2 == rowIndex) + // leave the annotation only + .map(_._1) + + if (rowAnnotations.nonEmpty) + rowAnnotations + else + Seq.empty[Annotation] + }) + + } + + /** @group getParam */ + def getModelIfNotSet: E5 = _model.get.value + + override def onWrite(path: String, spark: SparkSession): Unit = { + super.onWrite(path, spark) + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflow, + "_e5", + E5Embeddings.tfFile, + configProtoBytes = getConfigProtoBytes, + savedSignatures = getSignatures) + } + + /** @group getParam */ + def getConfigProtoBytes: Option[Array[Byte]] = get(this.configProtoBytes).map(_.map(_.toByte)) + + /** @group getParam */ + def getSignatures: Option[Map[String, String]] = get(this.signatures) + + override protected def afterAnnotate(dataset: DataFrame): DataFrame = { + dataset.withColumn( + getOutputCol, + 
wrapSentenceEmbeddingsMetadata( + dataset.col(getOutputCol), + $(dimension), + Some($(storageRef)))) + } + +} + +trait ReadablePretrainedE5Model + extends ParamsAndFeaturesReadable[E5Embeddings] + with HasPretrained[E5Embeddings] { + override val defaultModelName: Some[String] = Some("e5_small") + + /** Java compliant-overrides */ + override def pretrained(): E5Embeddings = super.pretrained() + + override def pretrained(name: String): E5Embeddings = super.pretrained(name) + + override def pretrained(name: String, lang: String): E5Embeddings = + super.pretrained(name, lang) + + override def pretrained(name: String, lang: String, remoteLoc: String): E5Embeddings = + super.pretrained(name, lang, remoteLoc) +} + +trait ReadE5DLModel extends ReadTensorflowModel { + this: ParamsAndFeaturesReadable[E5Embeddings] => + + override val tfFile: String = "e5_tensorflow" + def readModel(instance: E5Embeddings, path: String, spark: SparkSession): Unit = { + + val tf = readTensorflowModel( + path, + spark, + "_e5_tf", + savedSignatures = instance.getSignatures, + initAllTables = false) + instance.setModelIfNotSet(spark, tf) + } + + addReader(readModel) + + def loadSavedModel(modelPath: String, spark: SparkSession): E5Embeddings = { + + val (localModelPath, detectedEngine) = modelSanityCheck(modelPath) + val vocabs = loadTextAsset(localModelPath, "vocab.txt").zipWithIndex.toMap + /*Universal parameters for all engines*/ + val annotatorModel = new E5Embeddings().setVocabulary(vocabs) + + annotatorModel.set(annotatorModel.engine, detectedEngine) + detectedEngine match { + case ModelEngine.tensorflow => + val (wrapper, signatures) = TensorflowWrapper.read( + localModelPath, + zipped = false, + useBundle = true, + tags = Array("serve"), + initAllTables = false) + + val _signatures = signatures match { + case Some(s) => s + case None => throw new Exception("Cannot load signature definitions from model!") + } + + /** the order of setSignatures is important if we use getSignatures inside 
+ * setModelIfNotSet + */ + annotatorModel + .setSignatures(_signatures) + .setModelIfNotSet(spark, wrapper) + + case _ => + throw new Exception(notSupportedEngineError) + } + + annotatorModel + } +} + +/** This is the companion object of [[E5Embeddings]]. Please refer to that class for the + * documentation. + */ +object E5Embeddings extends ReadablePretrainedE5Model with ReadE5DLModel { + private[E5Embeddings] val logger: Logger = + LoggerFactory.getLogger("E5Embeddings") +} diff --git a/src/test/scala/com/johnsnowlabs/nlp/embeddings/E5EmbeddingsTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/embeddings/E5EmbeddingsTestSpec.scala new file mode 100644 index 00000000000000..1e4ac3c697c10e --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/embeddings/E5EmbeddingsTestSpec.scala @@ -0,0 +1,57 @@ +/* + * Copyright 2017-2022 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.johnsnowlabs.nlp.embeddings + +import com.johnsnowlabs.nlp.base.DocumentAssembler +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.tags.{SlowTest, FastTest} +import org.apache.spark.ml.Pipeline +import org.scalatest.flatspec.AnyFlatSpec + +class E5EmbeddingsTestSpec extends AnyFlatSpec { + + "E5 Embeddings" should "correctly embed multiple sentences" taggedAs SlowTest in { + + import ResourceHelper.spark.implicits._ + + val ddd = Seq( + "query: how much protein should a female eat", + "query: summit define", + "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 " + + "grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or" + + " training for a marathon. Check out the chart below to see how much protein you should be eating each day.", + "passage: Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of" + + " a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more" + + " governments.") + .toDF("text") + + val document = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + + val embeddings = E5Embeddings + .pretrained() + .setInputCols(Array("document")) + .setOutputCol("e5") + + val pipeline = new Pipeline().setStages(Array(document, embeddings)) + + val pipelineDF = pipeline.fit(ddd).transform(ddd) + pipelineDF.select("e5.embeddings").show(truncate = false) + + } +} From 7645ad45ed01f795174cc18952fcfae15ca314ae Mon Sep 17 00:00:00 2001 From: Prabod Rathnayaka Date: Sat, 1 Jul 2023 23:17:32 +1000 Subject: [PATCH 06/13] SPARKNLP-846: BART: Added maxInputLength. (#13863) * Added maxInputLength. 
fixes #13829 * changed test types --- .../seq2seq/bart_transformer_test.py | 75 +++++++++++++++++++ .../scala/com/johnsnowlabs/ml/ai/Bart.scala | 15 ++-- .../annotators/seq2seq/BartTransformer.scala | 16 +++- .../nlp/annotators/seq2seq/BartTestSpec.scala | 73 +++++++++++++++++- 4 files changed, 172 insertions(+), 7 deletions(-) diff --git a/python/test/annotator/seq2seq/bart_transformer_test.py b/python/test/annotator/seq2seq/bart_transformer_test.py index d489d488ae8ae8..99f42d52bd9895 100644 --- a/python/test/annotator/seq2seq/bart_transformer_test.py +++ b/python/test/annotator/seq2seq/bart_transformer_test.py @@ -46,6 +46,81 @@ def runTest(self): results.select("documents.result", "answers.result").show(truncate=False) +@pytest.mark.slow +class BartTransformerMaxLengthTestSpec(unittest.TestCase): + def setUp(self): + self.spark = SparkContextForTest.spark + + def runTest(self): + data = self.spark.createDataFrame([ + [1, """ + Heat oven to 200C/180C fan/gas 6. Line each hole of a 12-hole muffin tin with a thin strip of baking + parchment across the middle that’s long enough so the ends stick out a centimetre or two – use a dab of + butter to stick in place. Roll out two thirds of the pastry on a lightly floured surface and stamp out + 12 x 10cm circles (you may need to re-roll trimmings). Press a circle into each hole to line. + + Sprinkle 1 tsp of breadcrumbs into the base of each pie. Tip the rest of the crumbs into a mixing bowl. + Squeeze in the sausage meat, discarding the skins, along with the bacon, mace, pepper, sage and just a + little salt. Get your hands in and mash and squish everything together until the breadcrumbs have just + about disappeared. Divide mixture between the holes, packing in firmly and shaping to a dome in the middle. + + Roll out the remaining pastry and stamp out 12 x 7cm circles. Brush with a little egg and add a top to + each pie, egg-side down to stick, carefully pressing pastry edges together to seal. 
Brush with more egg + (don’t throw away leftovers) and sprinkle with sesame seeds. Bake for 30 mins until golden then carefully + remove the pies from the tin, using the parchment ends to help you lift them out. Sit on a parchment lined + baking tray, brush all round the sides with more egg and put back in the oven for 8 mins. Cool completely + then eat with piccalilli, or your favourite pickle. + + Heat oven to 200C/180C fan/gas 6. Line each hole of a 12-hole muffin tin with a thin strip of baking + parchment across the middle that’s long enough so the ends stick out a centimetre or two – use a dab of + butter to stick in place. Roll out two thirds of the pastry on a lightly floured surface and stamp out + 12 x 10cm circles (you may need to re-roll trimmings). Press a circle into each hole to line. + + Sprinkle 1 tsp of breadcrumbs into the base of each pie. Tip the rest of the crumbs into a mixing bowl. + Squeeze in the sausage meat, discarding the skins, along with the bacon, mace, pepper, sage and just a + little salt. Get your hands in and mash and squish everything together until the breadcrumbs have just + about disappeared. Divide mixture between the holes, packing in firmly and shaping to a dome in the middle. + + Roll out the remaining pastry and stamp out 12 x 7cm circles. Brush with a little egg and add a top to + each pie, egg-side down to stick, carefully pressing pastry edges together to seal. Brush with more egg + (don’t throw away leftovers) and sprinkle with sesame seeds. Bake for 30 mins until golden then carefully + remove the pies from the tin, using the parchment ends to help you lift them out. Sit on a parchment lined + baking tray, brush all round the sides with more egg and put back in the oven for 8 mins. Cool completely + then eat with piccalilli, or your favourite pickle. + + Heat oven to 200C/180C fan/gas 6. 
Line each hole of a 12-hole muffin tin with a thin strip of baking + parchment across the middle that’s long enough so the ends stick out a centimetre or two – use a dab of + butter to stick in place. Roll out two thirds of the pastry on a lightly floured surface and stamp out + 12 x 10cm circles (you may need to re-roll trimmings). Press a circle into each hole to line. + + Sprinkle 1 tsp of breadcrumbs into the base of each pie. Tip the rest of the crumbs into a mixing bowl. + Squeeze in the sausage meat, discarding the skins, along with the bacon, mace, pepper, sage and just a + little salt. Get your hands in and mash and squish everything together until the breadcrumbs have just + about disappeared. Divide mixture between the holes, packing in firmly and shaping to a dome in the middle. + + Roll out the remaining pastry and stamp out 12 x 7cm circles. Brush with a little egg and add a top to + each pie, egg-side down to stick, carefully pressing pastry edges together to seal. Brush with more egg + (don’t throw away leftovers) and sprinkle with sesame seeds. Bake for 30 mins until golden then carefully + remove the pies from the tin, using the parchment ends to help you lift them out. Sit on a parchment lined + baking tray, brush all round the sides with more egg and put back in the oven for 8 mins. Cool completely + then eat with piccalilli, or your favourite pickle. 
+ """.strip().replace("\n", " ")]]).toDF("id", "text") + + document_assembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("documents") + + bart = BartTransformer.pretrained("distilbart_xsum_12_6") \ + .setTask("summarize:") \ + .setMaxOutputLength(30) \ + .setInputCols(["documents"]) \ + .setOutputCol("summaries") + + pipeline = Pipeline().setStages([document_assembler, bart]) + results = pipeline.fit(data).transform(data) + + results.select("summaries.result").show(truncate=False) + @pytest.mark.slow class BartTransformerSummaryTestSpec(unittest.TestCase): diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala index cb46d348450483..3edba90c0f2bd1 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala @@ -101,7 +101,8 @@ private[johnsnowlabs] class Bart( task: String, randomSeed: Option[Long] = None, ignoreTokenIds: Array[Int] = Array(), - beamSize: Int): Seq[Annotation] = { + beamSize: Int, + maxInputLength: Int): Seq[Annotation] = { val batchDecoder = sentences.grouped(batchSize).toArray.flatMap { batch => val batchSP = encode(batch, task) @@ -117,7 +118,8 @@ private[johnsnowlabs] class Bart( noRepeatNgramSize, randomSeed, ignoreTokenIds, - beamSize) + beamSize, + maxInputLength) decode(spIds) @@ -176,10 +178,12 @@ private[johnsnowlabs] class Bart( noRepeatNgramSize: Int, randomSeed: Option[Long], ignoreTokenIds: Array[Int] = Array(), - beamSize: Int): Array[Array[Int]] = { + beamSize: Int, + maxInputLength: Int): Array[Array[Int]] = { val ignoreTokenIdsInt = ignoreTokenIds - val expandedEncoderInputIdsVals = batch.flatMap(x => List.fill(beamSize)(x)) + val expandedEncoderInputIdsVals = + batch.flatMap(x => List.fill(beamSize)(x.take(maxInputLength))) val sequencesLength = expandedEncoderInputIdsVals.map(x => x.length).toArray val maxSentenceLength = sequencesLength.max // - curLen @@ -492,6 +496,7 @@ private[johnsnowlabs] 
class Bart( noRepeatNgramSize = 0, randomSeed = Option(0), ignoreTokenIds = Array(0), - beamSize = 1) + beamSize = 1, + maxInputLength = 512) } } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTransformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTransformer.scala index aa72c0e4738fbe..5e8a97a693a2df 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTransformer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTransformer.scala @@ -199,6 +199,18 @@ class BartTransformer(override val uid: String) this } + /** max length of the input sequence (Default: `0`) + * + * @group param + */ + val maxInputLength = + new IntParam(this, "maxInputLength", "Maximum length of the input sequence") + + def setMaxInputLength(value: Int): BartTransformer.this.type = { + set(maxInputLength, value) + this + } + /** @group getParam */ def getMinOutputLength: Int = $(this.minOutputLength) @@ -477,6 +489,7 @@ class BartTransformer(override val uid: String) ignoreTokenIds -> Array(), batchSize -> 1, beamSize -> 4, + maxInputLength -> 512, useCache -> true) override def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] = { @@ -503,7 +516,8 @@ class BartTransformer(override val uid: String) task = $(task), randomSeed = this.randomSeed, ignoreTokenIds = $(ignoreTokenIds), - beamSize = $(beamSize)) + beamSize = $(beamSize), + maxInputLength = $(maxInputLength)) } else { Seq() } diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTestSpec.scala index 11b5e8d768aa3d..553567d53a4df3 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTestSpec.scala @@ -18,7 +18,7 @@ package com.johnsnowlabs.nlp.annotators.seq2seq import com.johnsnowlabs.nlp.base.DocumentAssembler import 
com.johnsnowlabs.nlp.util.io.ResourceHelper -import com.johnsnowlabs.tags.SlowTest +import com.johnsnowlabs.tags.{SlowTest, FastTest} import com.johnsnowlabs.util.Benchmark import org.apache.spark.ml.Pipeline import org.scalatest.flatspec.AnyFlatSpec @@ -56,6 +56,77 @@ class BartTestSpec extends AnyFlatSpec { .show(truncate = false) } + "distilbart_xsum_12_6" should "handle text inputs longer than 512 and not crash" taggedAs SlowTest in { + // text longer than 512 + val testData = ResourceHelper.spark + .createDataFrame( + Seq( + ( + 1, + "PG&E stated it scheduled the blackouts in response to forecasts for high winds " + + "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were " + + "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." + + "PG&E stated it scheduled the blackouts in response to forecasts for high winds " + + "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were " + + "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." + + "PG&E stated it scheduled the blackouts in response to forecasts for high winds " + + "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were " + + "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." + + "PG&E stated it scheduled the blackouts in response to forecasts for high winds " + + "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were " + + "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." + + "PG&E stated it scheduled the blackouts in response to forecasts for high winds " + + "amid dry conditions. The aim is to reduce the risk of wildfires. 
Nearly 800 thousand customers were " + + "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." + + "PG&E stated it scheduled the blackouts in response to forecasts for high winds " + + "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were " + + "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." + + "PG&E stated it scheduled the blackouts in response to forecasts for high winds " + + "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were " + + "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." + + "PG&E stated it scheduled the blackouts in response to forecasts for high winds " + + "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were " + + "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." + + "PG&E stated it scheduled the blackouts in response to forecasts for high winds " + + "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were " + + "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." + + "PG&E stated it scheduled the blackouts in response to forecasts for high winds " + + "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were " + + "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." + + "PG&E stated it scheduled the blackouts in response to forecasts for high winds " + + "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were " + + "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." 
+ + "PG&E stated it scheduled the blackouts in response to forecasts for high winds " + + "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were " + + "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." + + "PG&E stated it scheduled the blackouts in response to forecasts for high winds " + + "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were " + + "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." + + "PG&E stated it scheduled the blackouts in response to forecasts for high winds " + + "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were " + + "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."))) + .toDF("id", "text") + + val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("documents") + + val bart = BartTransformer + .pretrained("distilbart_xsum_12_6") + .setTask("summarize:") + .setInputCols(Array("documents")) + .setDoSample(true) + .setMaxOutputLength(30) + .setOutputCol("generation") + + new Pipeline() + .setStages(Array(documentAssembler, bart)) + .fit(testData) + .transform(testData) + .select("generation.result") + .show(truncate = false) + } + "bart-large-cnn" should "run SparkNLP pipeline with maxLength=130 and doSample=true" taggedAs SlowTest in { val testData = ResourceHelper.spark .createDataFrame( From fb75bb1be9ed9f3fdafacaddb37b8028fa556f26 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Sat, 1 Jul 2023 17:24:59 +0200 Subject: [PATCH 07/13] Fix Model Engine detection [skip test] --- .../com/johnsnowlabs/nlp/embeddings/E5Embeddings.scala | 9 ++------- .../nlp/embeddings/InstructorEmbeddings.scala | 4 ++-- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git 
a/src/main/scala/com/johnsnowlabs/nlp/embeddings/E5Embeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/E5Embeddings.scala index d6a7fc59f05aeb..8c902a31a6785f 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/E5Embeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/E5Embeddings.scala @@ -18,17 +18,12 @@ package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.ml.ai.E5 import com.johnsnowlabs.ml.tensorflow._ -import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ - ReadSentencePieceModel, - WriteSentencePieceModel -} import com.johnsnowlabs.ml.util.LoadExternalModel.{ - loadSentencePieceAsset, loadTextAsset, modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.{BasicTokenizer, WordpieceEncoder} @@ -410,7 +405,7 @@ trait ReadE5DLModel extends ReadTensorflowModel { annotatorModel.set(annotatorModel.engine, detectedEngine) detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read( localModelPath, zipped = false, diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/InstructorEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/InstructorEmbeddings.scala index 8caee7749a00ad..ede6caa13ee84f 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/InstructorEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/InstructorEmbeddings.scala @@ -28,7 +28,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.serialization.MapFeature import com.johnsnowlabs.storage.HasStorageRef @@ -400,7 +400,7 @@ trait 
ReadInstructorDLModel extends ReadTensorflowModel with ReadSentencePieceMo annotatorModel.set(annotatorModel.engine, detectedEngine) val spModel = loadSentencePieceAsset(localModelPath, "spiece.model") detectedEngine match { - case ModelEngine.tensorflow => + case TensorFlow.name => val (wrapper, signatures) = TensorflowWrapper.read( localModelPath, zipped = false, From ad1902415ac5729c512fec74ae07245212d16bf0 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Sat, 1 Jul 2023 17:25:17 +0200 Subject: [PATCH 08/13] Add InstructorEmbeddings and E5Embeddings to Python downloader [skip test] --- .../com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala index 4abafc4b99febb..f38f9735604fb7 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala @@ -717,7 +717,9 @@ object PythonResourceDownloader { "BartTransformer" -> BartTransformer, "BertForZeroShotClassification" -> BertForZeroShotClassification, "DistilBertForZeroShotClassification" -> DistilBertForZeroShotClassification, - "RoBertaForZeroShotClassification" -> RoBertaForZeroShotClassification) + "RoBertaForZeroShotClassification" -> RoBertaForZeroShotClassification, + "InstructorEmbeddings" -> InstructorEmbeddings, + "E5Embeddings" -> E5Embeddings) // List pairs of types such as the one with key type can load a pretrained model from the value type val typeMapper: Map[String, String] = Map("ZeroShotNerModel" -> "RoBertaForQuestionAnswering") From 08cad55eba3a1dcfd622b91d4fc8465361f9f8a2 Mon Sep 17 00:00:00 2001 From: Stefano Lori Date: Mon, 3 Jul 2023 09:10:20 +0200 Subject: [PATCH 09/13] Feature/doc similarity ranker (#13858) * Added doc similarity ranker annotator template * Created ranker model 
* gitignore modified * Added params to LSH models * Added BRP LSH as annotator engine * Added replace features col with embeddings * Added LSH logic on vector cast * Added skeleton for lsh doc sim ranker - WIP * Fixed mh3 hash calculation * Fixed dataset assertions id vs neghbours * Converting neighbours result string to map * Added finisher to extract lsh id and neighbors * Labels refactoring * Added distance param to show in rankings * Added logic to select nearest neighbor * Added identity ranking for debugging * Adding Python interface to doc sim ranker approach and model * WIP - Python interface * WIP - fixed umbalanced embeddings Py test * Added MinHash engine to doc sim ranker * Fixed serde for ranker map params * Clean up pytests * Added doc sim ranker finisher Python interface * stabilized tests for doc sim ranker * Moved and enriched test for doc sim ranker * Bumped version 5.0.0 in doc sim ranker test --------- Co-authored-by: Stefano Lori --- .gitignore | 1 + .../doc-sim-ranker/test_doc_sim_ranker.ipynb | 542 ++++++++++++++++++ .../sparknlp/annotator/similarity/__init__.py | 0 .../similarity/document_similarity_ranker.py | 232 ++++++++ python/sparknlp/common/annotator_type.py | 1 + python/test/annotator/similarity/__init__.py | 0 .../similarity/doc_similarity_ranker_test.py | 90 +++ .../com/johnsnowlabs/nlp/AnnotatorType.scala | 2 +- .../DocumentSimilarityRankerApproach.scala | 223 +++++++ .../DocumentSimilarityRankerModel.scala | 78 +++ .../DocumentSimilarityRankerFinisher.scala | 181 ++++++ .../DocumentSimilarityRankerTestSpec.scala | 275 +++++++++ 12 files changed, 1624 insertions(+), 1 deletion(-) create mode 100644 examples/python/annotation/text/english/text-similarity/doc-sim-ranker/test_doc_sim_ranker.ipynb create mode 100644 python/sparknlp/annotator/similarity/__init__.py create mode 100644 python/sparknlp/annotator/similarity/document_similarity_ranker.py create mode 100644 python/test/annotator/similarity/__init__.py create mode 100644 
python/test/annotator/similarity/doc_similarity_ranker_test.py create mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/similarity/DocumentSimilarityRankerApproach.scala create mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/similarity/DocumentSimilarityRankerModel.scala create mode 100644 src/main/scala/com/johnsnowlabs/nlp/finisher/DocumentSimilarityRankerFinisher.scala create mode 100644 src/test/scala/com/johnsnowlabs/nlp/similarity/DocumentSimilarityRankerTestSpec.scala diff --git a/.gitignore b/.gitignore index aa264460d82d79..e91a8952f7c8d5 100644 --- a/.gitignore +++ b/.gitignore @@ -338,3 +338,4 @@ python/docs/reference/_autosummary/** # MS Visio Code **/.vscode/ +.metals/ \ No newline at end of file diff --git a/examples/python/annotation/text/english/text-similarity/doc-sim-ranker/test_doc_sim_ranker.ipynb b/examples/python/annotation/text/english/text-similarity/doc-sim-ranker/test_doc_sim_ranker.ipynb new file mode 100644 index 00000000000000..eb77b388e42dc7 --- /dev/null +++ b/examples/python/annotation/text/english/text-similarity/doc-sim-ranker/test_doc_sim_ranker.ipynb @@ -0,0 +1,542 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "c3dc7ce5", + "metadata": {}, + "source": [ + "# Document Similarity Ranker for Spark NLP\n", + "### Efficient approximate nearest neighbor search on top of sentence embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1a9dd32e", + "metadata": {}, + "outputs": [], + "source": [ + "# Import Spark NLP classes\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from sparknlp.pretrained import PretrainedPipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "82846deb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":: loading settings :: url = 
jar:file:/Users/stefanolori/opt/anaconda3/envs/spknlp/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Ivy Default Cache set to: /Users/stefanolori/.ivy2/cache\n", + "The jars for the packages stored in: /Users/stefanolori/.ivy2/jars\n", + "com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency\n", + ":: resolving dependencies :: org.apache.spark#spark-submit-parent-d858c4fe-292f-4adf-8944-9ebef53c59cd;1.0\n", + "\tconfs: [default]\n", + "\tfound com.johnsnowlabs.nlp#spark-nlp_2.12;4.4.4 in local-ivy-cache\n", + "\tfound com.typesafe#config;1.4.2 in local-m2-cache\n", + "\tfound org.rocksdb#rocksdbjni;6.29.5 in central\n", + "\tfound com.amazonaws#aws-java-sdk-bundle;1.11.828 in central\n", + "\tfound com.github.universal-automata#liblevenshtein;3.0.0 in central\n", + "\tfound com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central\n", + "\tfound com.google.protobuf#protobuf-java;3.0.0-beta-3 in central\n", + "\tfound com.google.code.gson#gson;2.3 in central\n", + "\tfound it.unimi.dsi#fastutil;7.0.12 in central\n", + "\tfound org.projectlombok#lombok;1.16.8 in central\n", + "\tfound com.google.cloud#google-cloud-storage;2.16.0 in central\n", + "\tfound com.google.guava#guava;31.1-jre in central\n", + "\tfound com.google.guava#failureaccess;1.0.1 in central\n", + "\tfound com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava in central\n", + "\tfound com.google.errorprone#error_prone_annotations;2.16 in central\n", + "\tfound com.google.j2objc#j2objc-annotations;1.3 in central\n", + "\tfound com.google.http-client#google-http-client;1.42.3 in central\n", + "\tfound io.opencensus#opencensus-contrib-http-util;0.31.1 in central\n", + "\tfound com.google.http-client#google-http-client-jackson2;1.42.3 in central\n", + "\tfound com.google.http-client#google-http-client-gson;1.42.3 in central\n", + "\tfound 
com.google.api-client#google-api-client;2.1.1 in central\n", + "\tfound commons-codec#commons-codec;1.15 in central\n", + "\tfound com.google.oauth-client#google-oauth-client;1.34.1 in central\n", + "\tfound com.google.http-client#google-http-client-apache-v2;1.42.3 in central\n", + "\tfound com.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 in central\n", + "\tfound com.google.code.gson#gson;2.10 in central\n", + "\tfound com.google.cloud#google-cloud-core;2.9.0 in central\n", + "\tfound com.google.auto.value#auto-value-annotations;1.10.1 in central\n", + "\tfound com.google.cloud#google-cloud-core-http;2.9.0 in central\n", + "\tfound com.google.http-client#google-http-client-appengine;1.42.3 in central\n", + "\tfound com.google.api#gax-httpjson;0.105.1 in central\n", + "\tfound com.google.cloud#google-cloud-core-grpc;2.9.0 in central\n", + "\tfound io.grpc#grpc-core;1.51.0 in central\n", + "\tfound com.google.api#gax;2.20.1 in central\n", + "\tfound com.google.api#gax-grpc;2.20.1 in central\n", + "\tfound io.grpc#grpc-alts;1.51.0 in central\n", + "\tfound io.grpc#grpc-grpclb;1.51.0 in central\n", + "\tfound org.conscrypt#conscrypt-openjdk-uber;2.5.2 in central\n", + "\tfound io.grpc#grpc-protobuf;1.51.0 in central\n", + "\tfound com.google.auth#google-auth-library-credentials;1.13.0 in central\n", + "\tfound com.google.auth#google-auth-library-oauth2-http;1.13.0 in central\n", + "\tfound com.google.api#api-common;2.2.2 in central\n", + "\tfound javax.annotation#javax.annotation-api;1.3.2 in local-m2-cache\n", + "\tfound io.opencensus#opencensus-api;0.31.1 in central\n", + "\tfound io.grpc#grpc-context;1.51.0 in central\n", + "\tfound com.google.api.grpc#proto-google-iam-v1;1.6.22 in central\n", + "\tfound com.google.protobuf#protobuf-java;3.21.10 in central\n", + "\tfound com.google.protobuf#protobuf-java-util;3.21.10 in central\n", + "\tfound com.google.api.grpc#proto-google-common-protos;2.11.0 in central\n", + "\tfound 
org.threeten#threetenbp;1.6.4 in central\n", + "\tfound com.google.api.grpc#proto-google-cloud-storage-v2;2.16.0-alpha in central\n", + "\tfound com.google.api.grpc#grpc-google-cloud-storage-v2;2.16.0-alpha in central\n", + "\tfound com.google.api.grpc#gapic-google-cloud-storage-v2;2.16.0-alpha in central\n", + "\tfound com.fasterxml.jackson.core#jackson-core;2.14.1 in central\n", + "\tfound com.google.code.findbugs#jsr305;3.0.2 in central\n", + "\tfound io.grpc#grpc-api;1.51.0 in central\n", + "\tfound io.grpc#grpc-auth;1.51.0 in central\n", + "\tfound io.grpc#grpc-stub;1.51.0 in central\n", + "\tfound org.checkerframework#checker-qual;3.28.0 in central\n", + "\tfound com.google.api.grpc#grpc-google-iam-v1;1.6.22 in central\n", + "\tfound io.grpc#grpc-protobuf-lite;1.51.0 in central\n", + "\tfound com.google.android#annotations;4.1.1.4 in central\n", + "\tfound org.codehaus.mojo#animal-sniffer-annotations;1.22 in central\n", + "\tfound io.grpc#grpc-netty-shaded;1.51.0 in central\n", + "\tfound io.perfmark#perfmark-api;0.26.0 in central\n", + "\tfound io.grpc#grpc-googleapis;1.51.0 in central\n", + "\tfound io.grpc#grpc-xds;1.51.0 in central\n", + "\tfound io.opencensus#opencensus-proto;0.2.0 in central\n", + "\tfound io.grpc#grpc-services;1.51.0 in central\n", + "\tfound com.google.re2j#re2j;1.6 in central\n", + "\tfound com.navigamez#greex;1.0 in central\n", + "\tfound dk.brics.automaton#automaton;1.11-8 in central\n", + "\tfound com.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 in central\n", + ":: resolution report :: resolve 1092ms :: artifacts dl 43ms\n", + "\t:: modules in use:\n", + "\tcom.amazonaws#aws-java-sdk-bundle;1.11.828 from central in [default]\n", + "\tcom.fasterxml.jackson.core#jackson-core;2.14.1 from central in [default]\n", + "\tcom.github.universal-automata#liblevenshtein;3.0.0 from central in [default]\n", + "\tcom.google.android#annotations;4.1.1.4 from central in [default]\n", + "\tcom.google.api#api-common;2.2.2 from central in [default]\n", 
+ "\tcom.google.api#gax;2.20.1 from central in [default]\n", + "\tcom.google.api#gax-grpc;2.20.1 from central in [default]\n", + "\tcom.google.api#gax-httpjson;0.105.1 from central in [default]\n", + "\tcom.google.api-client#google-api-client;2.1.1 from central in [default]\n", + "\tcom.google.api.grpc#gapic-google-cloud-storage-v2;2.16.0-alpha from central in [default]\n", + "\tcom.google.api.grpc#grpc-google-cloud-storage-v2;2.16.0-alpha from central in [default]\n", + "\tcom.google.api.grpc#grpc-google-iam-v1;1.6.22 from central in [default]\n", + "\tcom.google.api.grpc#proto-google-cloud-storage-v2;2.16.0-alpha from central in [default]\n", + "\tcom.google.api.grpc#proto-google-common-protos;2.11.0 from central in [default]\n", + "\tcom.google.api.grpc#proto-google-iam-v1;1.6.22 from central in [default]\n", + "\tcom.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 from central in [default]\n", + "\tcom.google.auth#google-auth-library-credentials;1.13.0 from central in [default]\n", + "\tcom.google.auth#google-auth-library-oauth2-http;1.13.0 from central in [default]\n", + "\tcom.google.auto.value#auto-value-annotations;1.10.1 from central in [default]\n", + "\tcom.google.cloud#google-cloud-core;2.9.0 from central in [default]\n", + "\tcom.google.cloud#google-cloud-core-grpc;2.9.0 from central in [default]\n", + "\tcom.google.cloud#google-cloud-core-http;2.9.0 from central in [default]\n", + "\tcom.google.cloud#google-cloud-storage;2.16.0 from central in [default]\n", + "\tcom.google.code.findbugs#jsr305;3.0.2 from central in [default]\n", + "\tcom.google.code.gson#gson;2.10 from central in [default]\n", + "\tcom.google.errorprone#error_prone_annotations;2.16 from central in [default]\n", + "\tcom.google.guava#failureaccess;1.0.1 from central in [default]\n", + "\tcom.google.guava#guava;31.1-jre from central in [default]\n", + "\tcom.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava from central in [default]\n", + 
"\tcom.google.http-client#google-http-client;1.42.3 from central in [default]\n", + "\tcom.google.http-client#google-http-client-apache-v2;1.42.3 from central in [default]\n", + "\tcom.google.http-client#google-http-client-appengine;1.42.3 from central in [default]\n", + "\tcom.google.http-client#google-http-client-gson;1.42.3 from central in [default]\n", + "\tcom.google.http-client#google-http-client-jackson2;1.42.3 from central in [default]\n", + "\tcom.google.j2objc#j2objc-annotations;1.3 from central in [default]\n", + "\tcom.google.oauth-client#google-oauth-client;1.34.1 from central in [default]\n", + "\tcom.google.protobuf#protobuf-java;3.21.10 from central in [default]\n", + "\tcom.google.protobuf#protobuf-java-util;3.21.10 from central in [default]\n", + "\tcom.google.re2j#re2j;1.6 from central in [default]\n", + "\tcom.johnsnowlabs.nlp#spark-nlp_2.12;4.4.4 from local-ivy-cache in [default]\n", + "\tcom.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 from central in [default]\n", + "\tcom.navigamez#greex;1.0 from central in [default]\n", + "\tcom.typesafe#config;1.4.2 from local-m2-cache in [default]\n", + "\tcommons-codec#commons-codec;1.15 from central in [default]\n", + "\tdk.brics.automaton#automaton;1.11-8 from central in [default]\n", + "\tio.grpc#grpc-alts;1.51.0 from central in [default]\n", + "\tio.grpc#grpc-api;1.51.0 from central in [default]\n", + "\tio.grpc#grpc-auth;1.51.0 from central in [default]\n", + "\tio.grpc#grpc-context;1.51.0 from central in [default]\n", + "\tio.grpc#grpc-core;1.51.0 from central in [default]\n", + "\tio.grpc#grpc-googleapis;1.51.0 from central in [default]\n", + "\tio.grpc#grpc-grpclb;1.51.0 from central in [default]\n", + "\tio.grpc#grpc-netty-shaded;1.51.0 from central in [default]\n", + "\tio.grpc#grpc-protobuf;1.51.0 from central in [default]\n", + "\tio.grpc#grpc-protobuf-lite;1.51.0 from central in [default]\n", + "\tio.grpc#grpc-services;1.51.0 from central in [default]\n", + "\tio.grpc#grpc-stub;1.51.0 from 
central in [default]\n", + "\tio.grpc#grpc-xds;1.51.0 from central in [default]\n", + "\tio.opencensus#opencensus-api;0.31.1 from central in [default]\n", + "\tio.opencensus#opencensus-contrib-http-util;0.31.1 from central in [default]\n", + "\tio.opencensus#opencensus-proto;0.2.0 from central in [default]\n", + "\tio.perfmark#perfmark-api;0.26.0 from central in [default]\n", + "\tit.unimi.dsi#fastutil;7.0.12 from central in [default]\n", + "\tjavax.annotation#javax.annotation-api;1.3.2 from local-m2-cache in [default]\n", + "\torg.checkerframework#checker-qual;3.28.0 from central in [default]\n", + "\torg.codehaus.mojo#animal-sniffer-annotations;1.22 from central in [default]\n", + "\torg.conscrypt#conscrypt-openjdk-uber;2.5.2 from central in [default]\n", + "\torg.projectlombok#lombok;1.16.8 from central in [default]\n", + "\torg.rocksdb#rocksdbjni;6.29.5 from central in [default]\n", + "\torg.threeten#threetenbp;1.6.4 from central in [default]\n", + "\t:: evicted modules:\n", + "\tcom.google.protobuf#protobuf-java-util;3.0.0-beta-3 by [com.google.protobuf#protobuf-java-util;3.21.10] in [default]\n", + "\tcom.google.protobuf#protobuf-java;3.0.0-beta-3 by [com.google.protobuf#protobuf-java;3.21.10] in [default]\n", + "\tcom.google.code.gson#gson;2.3 by [com.google.code.gson#gson;2.10] in [default]\n", + "\t---------------------------------------------------------------------\n", + "\t| | modules || artifacts |\n", + "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", + "\t---------------------------------------------------------------------\n", + "\t| default | 73 | 0 | 0 | 3 || 70 | 0 |\n", + "\t---------------------------------------------------------------------\n", + ":: retrieving :: org.apache.spark#spark-submit-parent-d858c4fe-292f-4adf-8944-9ebef53c59cd\n", + "\tconfs: [default]\n", + "\t0 artifacts copied, 70 already retrieved (0kB/16ms)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "23/07/01 22:00:42 WARN 
NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" + ] + } + ], + "source": [ + "# Create the PySpark session\n", + "from pyspark.sql import SparkSession\n", + "\n", + "spark = SparkSession.builder \\\n", + " .appName(\"Spark NLP\")\\\n", + " .master(\"local[*]\")\\\n", + " .config(\"spark.driver.memory\",\"16G\")\\\n", + " .config(\"spark.driver.maxResultSize\", \"0\") \\\n", + " .config(\"spark.kryoserializer.buffer.max\", \"2000M\")\\\n", + " .config(\"spark.jars.packages\", \"com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0\")\\\n", + " .getOrCreate()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a3f563d5", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's use some dataset where we can visually control similarity\n", + "# Documents are coupled, as 1-2, 3-4, 5-6, 7-8 and they were voluntarily created similar\n", + "data = spark.createDataFrame(\n", + " [\n", + " [\"First document, this is my first sentence. This is my second sentence.\"],\n", + " [\"Second document, this is my second sentence. 
This is my second sentence.\"],\n", + " [\"Third document, climate change is arguably one of the most pressing problems of our time.\"],\n", + " [\"Fourth document, climate change is definitely one of the most pressing problems of our time.\"],\n", + " [\"Fifth document, Florence in Italy, is among the most beautiful cities in Europe.\"],\n", + " [\"Sixth document, Florence in Italy, is a very beautiful city in Europe like Lyon in France.\"],\n", + " [\"Seventh document, the French Riviera is the Mediterranean coastline of the southeast corner of France.\"],\n", + " [\"Eighth document, the warmest place in France is the French Riviera coast in Southern France.\"]\n", + " ]\n", + " ).toDF(\"text\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "34604126", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 0:> (0 + 1) / 1]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------------------------------------------------------------------------------------------------+\n", + "|text |\n", + "+------------------------------------------------------------------------------------------------------+\n", + "|First document, this is my first sentence. This is my second sentence. |\n", + "|Second document, this is my second sentence. This is my second sentence. |\n", + "|Third document, climate change is arguably one of the most pressing problems of our time. |\n", + "|Fourth document, climate change is definitely one of the most pressing problems of our time. |\n", + "|Fifth document, Florence in Italy, is among the most beautiful cities in Europe. |\n", + "|Sixth document, Florence in Italy, is a very beautiful city in Europe like Lyon in France. |\n", + "|Seventh document, the French Riviera is the Mediterranean coastline of the southeast corner of France.|\n", + "|Eighth document, the warmest place in France is the French Riviera coast in Southern France. 
|\n", + "+------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "data.show(10, False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "945e787d", + "metadata": {}, + "source": [ + "## A document similarity ranker pipeline\n", + "### The document similarity ranker works downstream of other annotators generating sentence embeddings. In this example we'll use RoBertaSentenceEmbeddings.\n", + "The pipeline will use the following steps:\n", + "- document_assembler to annotate the documents\n", + "- sentence_detector to detect sentences\n", + "- tokenizer to apply tokenization\n", + "- sentence_embeddings to create the necessary sentence embeddings representation\n", + "- document_similarity_ranker to extract the similar documents via annotator configuration\n", + "- document_similarity_ranker_finisher to extract the column of interest for this new annotator" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d4d2bd1d", + "metadata": {}, + "source": [ + "## DocumentSimilarityRankerApproach: input parameter setters overview\n", + "- setInputCols(\"sentence_embeddings\") : this setter will address input column\n", + "- setOutputCol(\"doc_similarity_rankings\") : this setter will address output column\n", + "- setSimilarityMethod(\"brp\") : this setter will select the LSH method (lsh|mh) used to apply approximate nearest neighbours search\n", + "- setNumberOfNeighbours(10) : this setter will address the desired number of similar documents for a given document in the set\n", + "- setBucketLength(2.0) : LSH parameter used to control the average size of hash buckets and improve recall\n", + "- setNumHashTables(3) : LSH parameter used to control number of hash tables used in LSH OR-amplification and improve recall\n", + "- setVisibleDistances(True) : this setter will make 
distances visible in the result, useful for debugging level information\n", + "- setIdentityRanking(False) : this setter will make identity distance (0.0) visible, useful for debugging level information" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "0b36d5cd", + "metadata": {}, + "source": [ + "## DocumentSimilarityRankerFinisher: out parameters overview\n", + "- setInputCols(\"doc_similarity_rankings\") : this setter will read the result column to extract IDs and distances\n", + "- setOutputCols(\n", + " \"finished_doc_similarity_rankings_id\",\n", + " \"finished_doc_similarity_rankings_neighbors\") : this setter selects the column with the document query ID and the neighbors document that results from the search run" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9a8f9eae", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sent_roberta_base download started this may take some time.\n", + "Approximate size to download 284.8 MB\n", + "[ | ]sent_roberta_base download started this may take some time.\n", + "Approximate size to download 284.8 MB\n", + "Download done! 
Loading the resource.\n", + "[ / ]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-07-01 22:01:11.233544: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ \\ ]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: An illegal reflective access operation has occurred\n", + "WARNING: Illegal reflective access by org.apache.spark.util.SizeEstimator$ (file:/Users/stefanolori/opt/anaconda3/envs/spknlp/lib/python3.8/site-packages/pyspark/jars/spark-core_2.12-3.3.1.jar) to field java.lang.ref.Reference.referent\n", + "WARNING: Please consider reporting this to the maintainers of org.apache.spark.util.SizeEstimator$\n", + "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", + "WARNING: All illegal access operations will be denied in a future release\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[OK!]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "23/07/01 22:01:22 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS\n", + "23/07/01 22:01:22 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------------------------------+------------------------------------------+\n", + 
"|finished_doc_similarity_rankings_id|finished_doc_similarity_rankings_neighbors|\n", + "+-----------------------------------+------------------------------------------+\n", + "|1510101612 |[(1634839239,0.12448559273510636)] |\n", + "|1634839239 |[(1510101612,0.12448559273510636)] |\n", + "|-612640902 |[(1274183715,0.12201215887654807)] |\n", + "|1274183715 |[(-612640902,0.12201215887654807)] |\n", + "|-1320876223 |[(1293373212,0.17848861258809434)] |\n", + "|1293373212 |[(-1320876223,0.17848861258809434)] |\n", + "|-1548374770 |[(-1719102856,0.2329717161223739)] |\n", + "|-1719102856 |[(-1548374770,0.2329717161223739)] |\n", + "+-----------------------------------+------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "from sparknlp.annotator.similarity.document_similarity_ranker import *\n", + "\n", + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol(\"text\") \\\n", + " .setOutputCol(\"document\")\n", + "sentence_detector = SentenceDetector() \\\n", + " .setInputCols([\"document\"]) \\\n", + " .setOutputCol(\"sentence\")\n", + "tokenizer = Tokenizer() \\\n", + " .setInputCols([\"sentence\"]) \\\n", + " .setOutputCol(\"token\")\n", + "\n", + "sentence_embeddings = RoBertaSentenceEmbeddings.pretrained() \\\n", + " .setInputCols([\"document\"]) \\\n", + " .setOutputCol(\"sentence_embeddings\")\n", + "\n", + "document_similarity_ranker = DocumentSimilarityRankerApproach() \\\n", + " .setInputCols(\"sentence_embeddings\") \\\n", + " .setOutputCol(\"doc_similarity_rankings\") \\\n", + " .setSimilarityMethod(\"brp\") \\\n", + " .setNumberOfNeighbours(1) \\\n", + " .setBucketLength(2.0) \\\n", + " .setNumHashTables(3) \\\n", + " .setVisibleDistances(True) \\\n", + " .setIdentityRanking(False)\n", + "\n", + "document_similarity_ranker_finisher = DocumentSimilarityRankerFinisher() \\\n", + " .setInputCols(\"doc_similarity_rankings\") \\\n", + " .setOutputCols(\n", + " \"finished_doc_similarity_rankings_id\",\n", + " 
\"finished_doc_similarity_rankings_neighbors\") \\\n", + " .setExtractNearestNeighbor(True)\n", + "\n", + "pipeline = Pipeline(stages=[\n", + " document_assembler,\n", + " sentence_detector,\n", + " tokenizer,\n", + " sentence_embeddings,\n", + " document_similarity_ranker,\n", + " document_similarity_ranker_finisher\n", + " ])\n", + "\n", + "docSimRankerPipeline = pipeline.fit(data).transform(data)\n", + "# TODO add write/read pipeline\n", + "(\n", + " docSimRankerPipeline\n", + " .select(\n", + " \"finished_doc_similarity_rankings_id\",\n", + " \"finished_doc_similarity_rankings_neighbors\"\n", + " ).show(10, False)\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "54eca293", + "metadata": {}, + "source": [ + "## Result analysis for consistent result confirmation\n", + "#### The test is asserting the initial hypothesis. The documents were created similar in pair: 1-2, 3-4, 5-6, 7-8.\n", + "For instance document 1 and 2 are detected mutually best neighbors at the very same distance respectively:\n", + "- document ID 1510101612 has his best similar document in (1634839239,0.12448559273510636) at distance 0.12448559273510636\n", + "- document ID 1634839239 has his best similar document in (1510101612,0.12448559273510636) at distance 0.12448559273510636\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cde88af", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/sparknlp/annotator/similarity/__init__.py b/python/sparknlp/annotator/similarity/__init__.py new file 
mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/sparknlp/annotator/similarity/document_similarity_ranker.py b/python/sparknlp/annotator/similarity/document_similarity_ranker.py new file mode 100644 index 00000000000000..00ba0738be2936 --- /dev/null +++ b/python/sparknlp/annotator/similarity/document_similarity_ranker.py @@ -0,0 +1,232 @@ +# Copyright 2017-2023 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains classes for DocumentSimilarityRanker.""" + +from sparknlp.common import * +from pyspark import keyword_only +from pyspark.ml.param import TypeConverters, Params, Param +from sparknlp.internal import AnnotatorTransformer + + +class DocumentSimilarityRankerApproach(AnnotatorApproach, HasEnableCachingProperties): + inputAnnotatorTypes = [AnnotatorType.SENTENCE_EMBEDDINGS] + + outputAnnotatorType = AnnotatorType.DOC_SIMILARITY_RANKINGS + + similarityMethod = Param(Params._dummy(), + "similarityMethod", + "The similarity method used to calculate the neighbours. (Default: 'brp', " + "Bucketed Random Projection for Euclidean Distance)", + typeConverter=TypeConverters.toString) + + numberOfNeighbours = Param(Params._dummy(), + "numberOfNeighbours", + "The number of neighbours the model will return (Default:`10`)", + typeConverter=TypeConverters.toInt) + + bucketLength = Param(Params._dummy(), + "bucketLength", + "The bucket length that controls the average size of hash buckets. 
" + "A larger bucket length (i.e., fewer buckets) increases the probability of features " + "being hashed to the same bucket (increasing the numbers of true and false positives).", + typeConverter=TypeConverters.toFloat) + + numHashTables = Param(Params._dummy(), + "numHashTables", + "number of hash tables, where increasing number of hash tables lowers the " + "false negative rate,and decreasing it improves the running performance.", + typeConverter=TypeConverters.toInt) + + visibleDistances = Param(Params._dummy(), + "visibleDistances", + "Whether to set visibleDistances in ranking output (Default: `false`).", + typeConverter=TypeConverters.toBoolean) + + identityRanking = Param(Params._dummy(), + "identityRanking", + "Whether to include identity in ranking result set. Useful for debug. (Default: `false`).", + typeConverter=TypeConverters.toBoolean) + + def setSimilarityMethod(self, value): + """Sets the similarity method used to calculate the neighbours. + (Default: `"brp"`, Bucketed Random Projection for Euclidean Distance) + + Parameters + ---------- + value : str + the similarity method to calculate the neighbours. + """ + return self._set(similarityMethod=value) + + def setNumberOfNeighbours(self, value): + """Sets The number of neighbours the model will return for each document(Default:`"10"`). + + Parameters + ---------- + value : str + the number of neighbours the model will return for each document. + """ + return self._set(numberOfNeighbours=value) + + def setBucketLength(self, value): + """Sets the bucket length that controls the average size of hash buckets (Default:`"2.0"`). + + Parameters + ---------- + value : float + Sets the bucket length that controls the average size of hash buckets. + """ + return self._set(bucketLength=value) + + def setNumHashTables(self, value): + """Sets the number of hash tables. + + Parameters + ---------- + value : int + Sets the number of hash tables. 
+ """ + return self._set(numHashTables=value) + + def setVisibleDistances(self, value): + """Sets the document distances visible in the result set. + + Parameters + ---------- + value : bool + Sets the document distances visible in the result set. + Default('False') + """ + return self._set(visibleDistances=value) + + def setIdentityRanking(self, value): + """Sets the document identity ranking inclusive in the result set. + + Parameters + ---------- + value : bool + Sets the document identity ranking inclusive in the result set. + Useful for debugging. + Default('False'). + """ + return self._set(identityRanking=value) + + @keyword_only + def __init__(self): + super(DocumentSimilarityRankerApproach, self)\ + .__init__(classname="com.johnsnowlabs.nlp.annotators.similarity.DocumentSimilarityRankerApproach") + self._setDefault( + similarityMethod="brp", + numberOfNeighbours=10, + bucketLength=2.0, + numHashTables=3, + visibleDistances=False, + identityRanking=False + ) + + def _create_model(self, java_model): + return DocumentSimilarityRankerModel(java_model=java_model) + + +class DocumentSimilarityRankerModel(AnnotatorModel, HasEmbeddingsProperties): + + name = "DocumentSimilarityRankerModel" + inputAnnotatorTypes = [AnnotatorType.SENTENCE_EMBEDDINGS] + outputAnnotatorType = AnnotatorType.DOC_SIMILARITY_RANKINGS + + def __init__(self, classname="com.johnsnowlabs.nlp.annotators.similarity.DocumentSimilarityRankerModel", + java_model=None): + super(DocumentSimilarityRankerModel, self).__init__( + classname=classname, + java_model=java_model + ) + + +class DocumentSimilarityRankerFinisher(AnnotatorTransformer): + + inputCols = Param(Params._dummy(), + "inputCols", + "name of input annotation cols containing document similarity ranker results", + typeConverter=TypeConverters.toListString) + outputCols = Param(Params._dummy(), + "outputCols", + "output DocumentSimilarityRankerFinisher output cols", + typeConverter=TypeConverters.toListString) + extractNearestNeighbor = 
Param(Params._dummy(), "extractNearestNeighbor", + "whether to extract the nearest neighbor document", + typeConverter=TypeConverters.toBoolean) + + name = "DocumentSimilarityRankerFinisher" + + @keyword_only + def __init__(self): + super(DocumentSimilarityRankerFinisher, self).__init__(classname="com.johnsnowlabs.nlp.finisher.DocumentSimilarityRankerFinisher") + self._setDefault( + extractNearestNeighbor=False + ) + + @keyword_only + def setParams(self): + kwargs = self._input_kwargs + return self._set(**kwargs) + + def setInputCols(self, *value): + """Sets name of input annotation columns containing embeddings. + + Parameters + ---------- + *value : str + Input columns for the annotator + """ + + if len(value) == 1 and type(value[0]) == list: + return self._set(inputCols=value[0]) + else: + return self._set(inputCols=list(value)) + + def setOutputCols(self, *value): + """Sets names of finished output columns. + + Parameters + ---------- + *value : List[str] + Input columns for the annotator + """ + + if len(value) == 1 and type(value[0]) == list: + return self._set(outputCols=value[0]) + else: + return self._set(outputCols=list(value)) + + def setExtractNearestNeighbor(self, value): + """Sets whether to extract the nearest neighbor document, by default False. 
+ + Parameters + ---------- + value : bool + Whether to extract the nearest neighbor document + """ + + return self._set(extractNearestNeighbor=value) + + def getInputCols(self): + """Gets input columns name of annotations.""" + return self.getOrDefault(self.inputCols) + + def getOutputCols(self): + """Gets output columns name of annotations.""" + if len(self.getOrDefault(self.outputCols)) == 0: + return ["finished_" + input_col for input_col in self.getInputCols()] + else: + return self.getOrDefault(self.outputCols) \ No newline at end of file diff --git a/python/sparknlp/common/annotator_type.py b/python/sparknlp/common/annotator_type.py index 2d0eb1ed54c9e8..0cd230a5ec480d 100644 --- a/python/sparknlp/common/annotator_type.py +++ b/python/sparknlp/common/annotator_type.py @@ -35,3 +35,4 @@ class AnnotatorType(object): NODE = "node" TABLE = "table" DUMMY = "dummy" + DOC_SIMILARITY_RANKINGS = "doc_similarity_rankings" diff --git a/python/test/annotator/similarity/__init__.py b/python/test/annotator/similarity/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/test/annotator/similarity/doc_similarity_ranker_test.py b/python/test/annotator/similarity/doc_similarity_ranker_test.py new file mode 100644 index 00000000000000..f9a93f4d12ee2d --- /dev/null +++ b/python/test/annotator/similarity/doc_similarity_ranker_test.py @@ -0,0 +1,90 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.annotator.similarity.document_similarity_ranker import * +from sparknlp.base import * +from test.util import SparkSessionForTest + + +@pytest.mark.slow +class DocumentSimilarityRankerTestSpec(unittest.TestCase): + def setUp(self): + self.spark = SparkSessionForTest.spark + + self.data = SparkSessionForTest.spark.createDataFrame([ + ["First document, this is my first sentence. This is my second sentence."], + ["Second document, this is my second sentence. This is my second sentence."], + ["Third document, climate change is arguably one of the most pressing problems of our time."], + ["Fourth document, climate change is definitely one of the most pressing problems of our time."], + ["Fifth document, Florence in Italy, is among the most beautiful cities in Europe."], + ["Sixth document, Florence in Italy, is a very beautiful city in Europe like Lyon in France."], + ["Seventh document, the French Riviera is the Mediterranean coastline of the southeast corner of France."], + ["Eighth document, the warmest place in France is the French Riviera coast in Southern France."] + ]).toDF("text") + + def runTest(self): + document_assembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + sentence_detector = SentenceDetector() \ + .setInputCols(["document"]) \ + .setOutputCol("sentence") + tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + + sentence_embeddings = RoBertaSentenceEmbeddings.pretrained() \ + .setInputCols(["document"]) \ + .setOutputCol("sentence_embeddings") + + document_similarity_ranker = DocumentSimilarityRankerApproach() \ + .setInputCols("sentence_embeddings") \ + .setOutputCol("doc_similarity_rankings") \ + .setSimilarityMethod("brp") \ + .setNumberOfNeighbours(10) \ + .setBucketLength(2.0) \ + .setNumHashTables(3) \ + .setVisibleDistances(True) \ + .setIdentityRanking(True) + + 
document_similarity_ranker_finisher = DocumentSimilarityRankerFinisher() \ + .setInputCols("doc_similarity_rankings") \ + .setOutputCols( + "finished_doc_similarity_rankings_id", + "finished_doc_similarity_rankings_neighbors") \ + .setExtractNearestNeighbor(True) + + pipeline = Pipeline(stages=[ + document_assembler, + sentence_detector, + tokenizer, + sentence_embeddings, + document_similarity_ranker, + document_similarity_ranker_finisher + ]) + + model = pipeline.fit(self.data) + + ( + model + .transform(self.data) + .select("text", + "finished_doc_similarity_rankings_id", + "finished_doc_similarity_rankings_neighbors") + .show(10, False) + ) \ No newline at end of file diff --git a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorType.scala b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorType.scala index 6a51a15b9e83cd..7e420f7f65eb43 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorType.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorType.scala @@ -38,5 +38,5 @@ object AnnotatorType { val NODE = "node" val TABLE = "table" val DUMMY = "dummy" - + val DOC_SIMILARITY_RANKINGS = "doc_similarity_rankings" } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/similarity/DocumentSimilarityRankerApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/similarity/DocumentSimilarityRankerApproach.scala new file mode 100644 index 00000000000000..1282303c995815 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/similarity/DocumentSimilarityRankerApproach.scala @@ -0,0 +1,223 @@ +package com.johnsnowlabs.nlp.annotators.similarity + +import com.johnsnowlabs.nlp.AnnotatorType.{DOC_SIMILARITY_RANKINGS, SENTENCE_EMBEDDINGS} +import com.johnsnowlabs.nlp.{AnnotatorApproach, HasEnableCachingProperties} +import com.johnsnowlabs.storage.HasStorageRef +import org.apache.spark.ml.PipelineModel +import org.apache.spark.ml.feature.{ + BucketedRandomProjectionLSH, + BucketedRandomProjectionLSHModel, + MinHashLSH +} +import 
org.apache.spark.ml.functions.array_to_vector +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.param.{BooleanParam, Param} +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.sql.functions.{col, flatten, udf} +import org.apache.spark.sql.{DataFrame, Dataset} + +import scala.util.hashing.MurmurHash3 + +sealed trait NeighborAnnotation { + def neighbors: Array[_] +} + +case class IndexedNeighbors(neighbors: Array[Int]) extends NeighborAnnotation + +case class IndexedNeighborsWithDistance(neighbors: Array[(Int, Double)]) + extends NeighborAnnotation + +case class NeighborsResultSet(result: (Int, NeighborAnnotation)) + +class DocumentSimilarityRankerApproach(override val uid: String) + extends AnnotatorApproach[DocumentSimilarityRankerModel] + with HasEnableCachingProperties { + + override val description: AnnotatorType = "LSH based document similarity annotator" + + /** Annotator reference id. Used to identify elements in metadata or to refer to this annotator + * type + */ + def this() = this(Identifiable.randomUID("DocumentSimilarityRankerApproach")) + + override val inputAnnotatorTypes: Array[AnnotatorType] = Array(SENTENCE_EMBEDDINGS) + + override val outputAnnotatorType: AnnotatorType = DOC_SIMILARITY_RANKINGS + + val LSH_INPUT_COL_NAME = "features" + + val LSH_OUTPUT_COL_NAME = "hashes" + + val INDEX_COL_NAME = "index" + + val DISTANCE = "distCol" + + val INPUT_EMBEDDINGS = "sentence_embeddings.embeddings" + + val TEXT = "text" + + /** The similarity method used to calculate the neighbours. (Default: `"brp"`, Bucketed Random + * Projection for Euclidean Distance) + * + * @group param + */ + val similarityMethod = new Param[String]( + this, + "similarityMethod", + """The similarity method used to calculate the neighbours. 
+ |(Default: `"brp"`, Bucketed Random Projection for Euclidean Distance) + |""".stripMargin) + + def setSimilarityMethod(value: String): this.type = set(similarityMethod, value) + + def getSimilarityMethod: String = $(similarityMethod) + + /** The number of neighbours the model will return (Default:`"10"`). + * + * @group param + */ + val numberOfNeighbours = new Param[Int]( + this, + "numberOfNeighbours", + """The number of neighbours the model will return for each document (Default:`"10"`)""") + + def setNumberOfNeighbours(value: Int): this.type = set(numberOfNeighbours, value) + + def getNumberOfNeighbours: Int = $(numberOfNeighbours) + + val bucketLength = new Param[Double]( + this, + "bucketLength", + """The bucket length that controls the average size of hash buckets. + |A larger bucket length (i.e., fewer buckets) increases the probability of features being hashed + |to the same bucket (increasing the numbers of true and false positives) + |""".stripMargin) + + def setBucketLength(value: Double): this.type = set(bucketLength, value) + + def getBucketLength: Double = $(bucketLength) + + val numHashTables = new Param[Int]( + this, + "numHashTables", + """number of hash tables, where increasing number of hash tables lowers the false negative rate, + |and decreasing it improves the running performance. + |""".stripMargin) + + def setNumHashTables(value: Int): this.type = set(numHashTables, value) + + val visibleDistances = new BooleanParam( + this, + "visibleDistances", + "Whether to set visibleDistances in ranking output (Default: `false`)") + + def setVisibleDistances(value: Boolean): this.type = set(visibleDistances, value) + + def getVisibleDistances: Boolean = $(visibleDistances) + + val identityRanking = new BooleanParam( + this, + "identityRanking", + "Whether to include identity in ranking result set. Useful for debug. 
(Default: `false`)") + + def setIdentityRanking(value: Boolean): this.type = set(identityRanking, value) + + def getIdentityRanking: Boolean = $(identityRanking) + + setDefault( + similarityMethod -> "brp", + numberOfNeighbours -> 10, + bucketLength -> 2.0, + numHashTables -> 3, + visibleDistances -> false, + identityRanking -> false) + + def getNeighborsResultSet( + query: (Int, Vector), + similarityDataset: DataFrame): NeighborsResultSet = { + + val lsh = $(similarityMethod) match { + case "brp" => + new BucketedRandomProjectionLSH() + .setBucketLength($(bucketLength)) + .setNumHashTables($(numHashTables)) + .setInputCol(LSH_INPUT_COL_NAME) + .setOutputCol(LSH_OUTPUT_COL_NAME) + case "mh" => + new MinHashLSH() + .setNumHashTables($(numHashTables)) + .setInputCol(LSH_INPUT_COL_NAME) + .setOutputCol(LSH_OUTPUT_COL_NAME) + case _ => + throw new IllegalArgumentException(s"${$(similarityMethod)} is not a valid value.") + } + + val model = lsh.fit(similarityDataset) + + query match { + case (index, queryVector) => + val _similarityDataset = + if (getIdentityRanking) { + similarityDataset + } else { + similarityDataset.where(col("index") =!= index) + } + + val similarRankedDocs = + model.approxNearestNeighbors(_similarityDataset, queryVector, getNumberOfNeighbours) + + if (getVisibleDistances) { + val rankedNeighboursWithDistances = similarRankedDocs + .select(INDEX_COL_NAME, DISTANCE) + .collect() + .map(row => (row.getInt(0), row.getDouble(1))) + + NeighborsResultSet((index, IndexedNeighborsWithDistance(rankedNeighboursWithDistances))) + } else { + val rankedNeighbours = similarRankedDocs + .select(INDEX_COL_NAME) + .collect() + .map(_.getInt(0)) + + NeighborsResultSet(index, IndexedNeighbors(rankedNeighbours)) + } + case _ => throw new IllegalArgumentException("query is not of type (Int, DenseVector)") + } + } + + override def train( + dataset: Dataset[_], + recursivePipeline: Option[PipelineModel]): DocumentSimilarityRankerModel = { + + val embeddingsDataset = 
dataset.withColumn(LSH_INPUT_COL_NAME, col(INPUT_EMBEDDINGS)) + + val similarityDataset: DataFrame = embeddingsDataset + .withColumn(s"$LSH_INPUT_COL_NAME", flatten(col(s"$LSH_INPUT_COL_NAME"))) + .withColumn(s"$LSH_INPUT_COL_NAME", array_to_vector(col(s"$LSH_INPUT_COL_NAME"))) + + val mh3UDF = udf { (s: String) => MurmurHash3.stringHash(s, MurmurHash3.stringSeed) } + + val similarityDatasetWithIndex = + similarityDataset.withColumn(INDEX_COL_NAME, mh3UDF(col(TEXT))) + + val indexedVectorTuples = similarityDatasetWithIndex + .select(INDEX_COL_NAME, LSH_INPUT_COL_NAME) + .rdd + .map(x => (x.getAs[Int](INDEX_COL_NAME), x.getAs[Vector](LSH_INPUT_COL_NAME))) + .collect() + + val similarityMappings: Map[Int, NeighborAnnotation] = indexedVectorTuples + .map(query => getNeighborsResultSet(query, similarityDatasetWithIndex)) + .map(_.result) + .toMap + + new DocumentSimilarityRankerModel() + .setSimilarityMappings(Map("similarityMappings" -> similarityMappings)) + } +} + +/** This is the companion object of [[DocumentSimilarityRankerApproach]]. Please refer to that + * class for the documentation. 
+ */ +object DocumentSimilarityRankerApproach + extends DefaultParamsReadable[DocumentSimilarityRankerApproach] diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/similarity/DocumentSimilarityRankerModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/similarity/DocumentSimilarityRankerModel.scala new file mode 100644 index 00000000000000..eb75d78c7df430 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/similarity/DocumentSimilarityRankerModel.scala @@ -0,0 +1,78 @@ +package com.johnsnowlabs.nlp.annotators.similarity + +import com.johnsnowlabs.nlp.AnnotatorType.{DOC_SIMILARITY_RANKINGS, SENTENCE_EMBEDDINGS} +import com.johnsnowlabs.nlp.embeddings.HasEmbeddingsProperties +import com.johnsnowlabs.nlp.serialization.MapFeature +import com.johnsnowlabs.nlp.{ + Annotation, + AnnotatorModel, + HasSimpleAnnotate, + ParamsAndFeaturesReadable, + ParamsAndFeaturesWritable +} +import com.johnsnowlabs.storage.HasStorageRef +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.sql.functions.col + +import scala.util.hashing.MurmurHash3 + +class DocumentSimilarityRankerModel(override val uid: String) + extends AnnotatorModel[DocumentSimilarityRankerModel] + with HasSimpleAnnotate[DocumentSimilarityRankerModel] + with HasEmbeddingsProperties + with ParamsAndFeaturesWritable { + + override val inputAnnotatorTypes: Array[AnnotatorType] = Array(SENTENCE_EMBEDDINGS) + + override val outputAnnotatorType: AnnotatorType = DOC_SIMILARITY_RANKINGS + + def this() = this(Identifiable.randomUID("DOC_SIMILARITY_RANKER")) + + /** Dictionary of words with their vectors + * + * @group param + */ + val similarityMappings: MapFeature[String, Map[Int, NeighborAnnotation]] = + new MapFeature(this, "similarityMappings") + + /** @group setParam */ + def setSimilarityMappings(value: Map[String, Map[Int, NeighborAnnotation]]): this.type = + set(similarityMappings, value) + + def getSimilarityMappings: Map[Int, NeighborAnnotation] = 
+ $$(similarityMappings).getOrElse("similarityMappings", Map.empty) + + setDefault(inputCols -> Array(SENTENCE_EMBEDDINGS), outputCol -> DOC_SIMILARITY_RANKINGS) + + /** takes a document and annotations and produces new annotations of this annotator's annotation + * type + * + * @param annotations + * Annotations that correspond to inputAnnotationCols generated by previous annotators if any + * @return + * any number of annotations processed for every input annotation. Not necessary one to one + * relationship + */ + override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = + annotations.map(annotation => { + val inputResult = annotation.result + val targetIndex = MurmurHash3.stringHash(inputResult, MurmurHash3.stringSeed) + val neighborsAnnotation: NeighborAnnotation = + getSimilarityMappings.getOrElse(targetIndex, IndexedNeighbors(Array.empty)) // index NA + + Annotation( + annotatorType = outputAnnotatorType, + begin = annotation.begin, + end = annotation.end, + result = annotation.result, + metadata = annotation.metadata + + ("lshId" -> targetIndex.toString) + + ("lshNeighbors" -> neighborsAnnotation.neighbors.mkString("[", ",", "]")), + embeddings = annotation.embeddings) + }) +} + +trait ReadableDocumentSimilarityRanker + extends ParamsAndFeaturesReadable[DocumentSimilarityRankerModel] + +object DocumentSimilarityRankerModel extends ReadableDocumentSimilarityRanker diff --git a/src/main/scala/com/johnsnowlabs/nlp/finisher/DocumentSimilarityRankerFinisher.scala b/src/main/scala/com/johnsnowlabs/nlp/finisher/DocumentSimilarityRankerFinisher.scala new file mode 100644 index 00000000000000..3aeb7ccb9dd29b --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/finisher/DocumentSimilarityRankerFinisher.scala @@ -0,0 +1,181 @@ +package com.johnsnowlabs.nlp.finisher + +import com.johnsnowlabs.nlp.AnnotatorType +import com.johnsnowlabs.nlp.util.FinisherUtil +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param.{BooleanParam, ParamMap, 
StringArrayParam} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.{IntegerType, StructType} +import org.apache.spark.sql.{DataFrame, Dataset} + +case class DocumentSimilarityRankerFinisher(override val uid: String) + extends Transformer + with DefaultParamsWritable { + + def this() = this(Identifiable.randomUID("DOCUMENT_SIMILARITY_RANKER_FINISHER")) + + val LSH_ID_COL_NAME = "lshId" + + val LSH_NEIGHBORS_COL_NAME = "lshNeighbors" + + val FINISHED_DOC_SIM_RANKER_ID_DEFAULT = "finished_doc_similarity_rankings_id" + + val FINISHED_DOC_SIM_RANKER_NEIGHBORS_DEFAULT = "finished_doc_similarity_rankings_neighbors" + + /** Name of input annotation cols containing embeddings + * + * @group param + */ + val inputCols: StringArrayParam = + new StringArrayParam( + this, + "inputCols", + "Name of input annotation cols containing similar documents") + + /** Name of input annotation cols containing similar documents + * + * @group setParam + */ + def setInputCols(value: Array[String]): this.type = set(inputCols, value) + + /** Name of input annotation cols containing similar documents + * + * @group setParam + */ + def setInputCols(value: String*): this.type = setInputCols(value.toArray) + + /** Name of DocumentSimilarityRankerFinisher output cols + * + * @group getParam + */ + def getInputCols: Array[String] = $(inputCols) + + /** Name of DocumentSimilarityRankerFinisher output cols + * + * @group param + */ + val outputCols: StringArrayParam = + new StringArrayParam( + this, + "outputCols", + "Name of DocumentSimilarityRankerFinisher output cols") + + /** Name of DocumentSimilarityRankerFinisher output cols + * + * @group setParam + */ + def setOutputCols(value: Array[String]): this.type = set(outputCols, value) + + /** Name of DocumentSimilarityRankerFinisher output cols + * + * @group setParam + */ + def setOutputCols(value: String*): this.type = 
setOutputCols(value.toArray) + + /** Name of input annotation cols containing embeddings + * + * @group getParam + */ + def getOutputCols: Array[String] = get(outputCols).getOrElse(getInputCols.map("finished_" + _)) + + val extractNearestNeighbor: BooleanParam = + new BooleanParam( + this, + "extractNearestNeighbor", + doc = "Extract the best neighbors with distance") + + /** Set flag to extract best neighbor with distance + * + * @group setParam + */ + def setExtractNearestNeighbor(value: Boolean): this.type = set(extractNearestNeighbor, value) + + /** Name of input annotation cols containing embeddings + * + * @group getParam + */ + def getExtractNearestNeighbor: Boolean = $(extractNearestNeighbor) + + setDefault(extractNearestNeighbor -> false) + + override def transform(dataset: Dataset[_]): DataFrame = { + + require( + getOutputCols.length == 1 || getOutputCols.length == 2, + "Output column array should have length 1 (default case) or 2 when value id and neighbors are assigned.") + + val (idColName, neighborsColName) = + getOutputCols.length match { + case 1 => (FINISHED_DOC_SIM_RANKER_ID_DEFAULT, FINISHED_DOC_SIM_RANKER_NEIGHBORS_DEFAULT) + case 2 => (getOutputCols(0), getOutputCols(1)) + } + + val transformed = dataset + .withColumn( + idColName, + element_at(col(s"${AnnotatorType.DOC_SIMILARITY_RANKINGS}.metadata"), 1) + .getItem(LSH_ID_COL_NAME) + .cast("int")) + .withColumn( + neighborsColName, + element_at(col(s"${AnnotatorType.DOC_SIMILARITY_RANKINGS}.metadata"), 1) + .getItem(LSH_NEIGHBORS_COL_NAME)) + + val formatted = transformed + .withColumn( + s"no_squared_$neighborsColName", + regexp_replace(col(neighborsColName), "[\\[\\]]", "")) + .withColumn( + s"tuple_extract_$neighborsColName", + regexp_extract(col(s"no_squared_$neighborsColName"), "\\((.*?)\\)", 0)) + .withColumn( + s"no_rounded_$neighborsColName", + regexp_replace(col(s"tuple_extract_$neighborsColName"), "[\\(\\)]", "")) + + val result = + if (getExtractNearestNeighbor) + formatted + 
.withColumn( + s"split_$neighborsColName", + split(col(s"no_rounded_$neighborsColName"), ",")) + .withColumn( + "nearest_neighbor_id", + element_at(col(s"split_$neighborsColName"), 1).cast(IntegerType)) + .withColumn("nearest_neighbor_distance", element_at(col(s"split_$neighborsColName"), 2)) + else + formatted + + result.drop( + s"no_squared_$neighborsColName", + s"tuple_extract_$neighborsColName", + s"no_rounded_$neighborsColName", + s"split_$neighborsColName") + } + + override def copy(extra: ParamMap): Transformer = defaultCopy(extra) + + override def transformSchema(schema: StructType): StructType = { + val documentSimilarityRankerAnnotators = Seq(AnnotatorType.DOC_SIMILARITY_RANKINGS) + + getInputCols.foreach { annotationColumn => + FinisherUtil.checkIfInputColsExist(getInputCols, schema) + FinisherUtil.checkIfAnnotationColumnIsSparkNLPAnnotation(schema, annotationColumn) + + /** Check if the annotationColumn has DocumentSimilarityRanker. It must be annotators: + * DocumentSimilarityRanker + */ + require( + documentSimilarityRankerAnnotators.contains( + schema(annotationColumn).metadata.getString("annotatorType")), + s"column [$annotationColumn] must be of type DocumentSimilarityRanker") + } + + val outputFields = schema.fields + + StructType(outputFields) + } +} + +object DocumentSimilarityRankerFinisher + extends DefaultParamsReadable[DocumentSimilarityRankerFinisher] diff --git a/src/test/scala/com/johnsnowlabs/nlp/similarity/DocumentSimilarityRankerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/similarity/DocumentSimilarityRankerTestSpec.scala new file mode 100644 index 00000000000000..ccdd8294db6471 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/similarity/DocumentSimilarityRankerTestSpec.scala @@ -0,0 +1,275 @@ +package com.johnsnowlabs.nlp.similarity + +import com.johnsnowlabs.nlp.AnnotatorType.DOC_SIMILARITY_RANKINGS +import com.johnsnowlabs.nlp.annotators.Tokenizer +import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector 
+import com.johnsnowlabs.nlp.annotators.similarity.DocumentSimilarityRankerApproach +import com.johnsnowlabs.nlp.base.DocumentAssembler +import com.johnsnowlabs.nlp.embeddings.{AlbertEmbeddings, SentenceEmbeddings} +import com.johnsnowlabs.nlp.finisher.DocumentSimilarityRankerFinisher +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.nlp.{AnnotatorBuilder, EmbeddingsFinisher} +import com.johnsnowlabs.tags.SlowTest +import org.apache.spark.ml.{Pipeline, PipelineModel} +import org.apache.spark.sql.{SparkSession, functions} +import org.apache.spark.sql.functions.{col, element_at, size} +import org.scalatest.flatspec.AnyFlatSpec + +class DocumentSimilarityRankerTestSpec extends AnyFlatSpec { + val spark: SparkSession = ResourceHelper.spark + + "DocumentSimilarityRanker" should "should use brp to rank document similarity" taggedAs SlowTest in { + + val smallCorpus = spark + .createDataFrame( + List( + "First document, this is my first sentence. This is my second sentence.", + "Second document, this is my second sentence. 
This is my second sentence.", + "Third document, climate change is arguably one of the most pressing problems of our time.", + "Fourth document, climate change is definitely one of the most pressing problems of our time.", + "Fifth document, Florence in Italy, is among the most beautiful cities in Europe.", + "Sixth document, Florence in Italy, is a very beautiful city in Europe like Lyon in France.", + "Seventh document, the French Riviera is the Mediterranean coastline of the southeast corner of France.", + "Eighth document, the warmest place in France is the French Riviera coast in Southern France.") + .map(Tuple1(_))) + .toDF("text") + + val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + + val sentence = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentence") + + val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + + val embeddings = AlbertEmbeddings + .pretrained() + .setInputCols("sentence", "token") + .setOutputCol("embeddings") + + val embeddingsSentence = new SentenceEmbeddings() + .setInputCols(Array("document", "embeddings")) + .setOutputCol("sentence_embeddings") + .setPoolingStrategy("AVERAGE") + + val sentenceFinisher = new EmbeddingsFinisher() + .setInputCols("sentence_embeddings") + .setOutputCols("finished_sentence_embeddings") + .setCleanAnnotations(false) + + val docSimilarityRanker = new DocumentSimilarityRankerApproach() + .setInputCols("sentence_embeddings") + .setOutputCol(DOC_SIMILARITY_RANKINGS) + .setSimilarityMethod("brp") + .setNumberOfNeighbours(3) + .setVisibleDistances(true) + .setIdentityRanking(true) + + val documentSimilarityFinisher = new DocumentSimilarityRankerFinisher() + .setInputCols("doc_similarity_rankings") + .setOutputCols( + "finished_doc_similarity_rankings_id", + "finished_doc_similarity_rankings_neighbors") + .setExtractNearestNeighbor(true) + + val pipeline = new Pipeline() + .setStages( + Array( + 
documentAssembler, + sentence, + tokenizer, + embeddings, + embeddingsSentence, + sentenceFinisher, + docSimilarityRanker, + documentSimilarityFinisher)) + + val trainedPipelineModel = pipeline.fit(smallCorpus) + + val pipelineModelLoc = "./tmp_doc_sim_ranker_brp_pipeline" + trainedPipelineModel.write.overwrite().save(pipelineModelLoc) + val pipelineModel = PipelineModel.load(pipelineModelLoc) + + val transformed = pipelineModel.transform(smallCorpus) + + transformed.select("text", "finished_sentence_embeddings").show() + + // correct if not empty as inclusive query points are at distance 0.0 from themselves + assert(!transformed.where(col("nearest_neighbor_distance") === 0.0).rdd.isEmpty() == true) + } + + "DocumentSimilarityRanker" should "should use min hash to rank document similarity" taggedAs SlowTest in { + + val smallCorpus = spark + .createDataFrame( + List( + "First document, this is my first sentence. This is my second sentence.", + "Second document, this is my second sentence. 
This is my second sentence.", + "Third document, climate change is arguably one of the most pressing problems of our time.", + "Fourth document, climate change is definitely one of the most pressing problems of our time.", + "Fifth document, Florence in Italy, is among the most beautiful cities in Europe.", + "Sixth document, Florence in Italy, is a very beautiful city in Europe like Lyon in France.", + "Seventh document, the French Riviera is the Mediterranean coastline of the southeast corner of France.", + "Eighth document, the warmest place in France is the French Riviera coast in Southern France.") + .map(Tuple1(_))) + .toDF("text") + + val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + + val sentence = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentence") + + val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + + val embeddings = AlbertEmbeddings + .pretrained() + .setInputCols("sentence", "token") + .setOutputCol("embeddings") + + val embeddingsSentence = new SentenceEmbeddings() + .setInputCols(Array("document", "embeddings")) + .setOutputCol("sentence_embeddings") + .setPoolingStrategy("AVERAGE") + + val sentenceFinisher = new EmbeddingsFinisher() + .setInputCols("sentence_embeddings") + .setOutputCols("finished_sentence_embeddings") + .setCleanAnnotations(false) + + val docSimilarityRanker = new DocumentSimilarityRankerApproach() + .setInputCols("sentence_embeddings") + .setOutputCol(DOC_SIMILARITY_RANKINGS) + .setSimilarityMethod("mh") + .setNumberOfNeighbours(3) + .setVisibleDistances(true) + .setIdentityRanking(true) + + val documentSimilarityFinisher = new DocumentSimilarityRankerFinisher() + .setInputCols("doc_similarity_rankings") + .setOutputCols( + "finished_doc_similarity_rankings_id", + "finished_doc_similarity_rankings_neighbors") + .setExtractNearestNeighbor(true) + + val pipeline = new Pipeline() + .setStages( + Array( + 
documentAssembler, + sentence, + tokenizer, + embeddings, + embeddingsSentence, + sentenceFinisher, + docSimilarityRanker, + documentSimilarityFinisher)) + + val trainedPipelineModel = pipeline.fit(smallCorpus) + + val pipelineModelLoc = "./tmp_doc_sim_ranker_mh_pipeline" + trainedPipelineModel.write.overwrite().save(pipelineModelLoc) + val pipelineModel = PipelineModel.load(pipelineModelLoc) + + val transformed = pipelineModel.transform(smallCorpus) + + // correct if not empty as inclusive query points are at distance 0.0 from themselves + assert(!transformed.where(col("nearest_neighbor_distance") === 0.0).rdd.isEmpty() == true) + } + + "Databricks pipeline" should "should use min hash to rank document similarity" taggedAs SlowTest in { + import com.johnsnowlabs.nlp.AnnotatorType.DOC_SIMILARITY_RANKINGS + import com.johnsnowlabs.nlp.annotators.Tokenizer + import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector + import com.johnsnowlabs.nlp.annotators.similarity.DocumentSimilarityRankerApproach + import com.johnsnowlabs.nlp.base.DocumentAssembler + import com.johnsnowlabs.nlp.embeddings.{AlbertEmbeddings, SentenceEmbeddings} + import com.johnsnowlabs.nlp.finisher.DocumentSimilarityRankerFinisher + import com.johnsnowlabs.nlp.util.io.ResourceHelper + import com.johnsnowlabs.nlp.EmbeddingsFinisher + import org.apache.spark.ml.{Pipeline, PipelineModel} + + val smallCorpus = spark + .createDataFrame( + List( + "First document, this is my first sentence. This is my second sentence.", + "Second document, this is my second sentence. 
This is my second sentence.", + "Third document, climate change is arguably one of the most pressing problems of our time.", + "Fourth document, climate change is definitely one of the most pressing problems of our time.", + "Fifth document, Florence in Italy, is among the most beautiful cities in Europe.", + "Sixth document, Florence in Italy, is a very beautiful city in Europe like Lyon in France.", + "Seventh document, the French Riviera is the Mediterranean coastline of the southeast corner of France.", + "Eighth document, the warmest place in France is the French Riviera coast in Southern France.") + .map(Tuple1(_))) + .toDF("text") + + val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + + val sentence = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentence") + + val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + + val embeddings = AlbertEmbeddings + .pretrained() + .setInputCols("sentence", "token") + .setOutputCol("embeddings") + + val embeddingsSentence = new SentenceEmbeddings() + .setInputCols(Array("document", "embeddings")) + .setOutputCol("sentence_embeddings") + .setPoolingStrategy("AVERAGE") + + val sentenceFinisher = new EmbeddingsFinisher() + .setInputCols("sentence_embeddings") + .setOutputCols("finished_sentence_embeddings") + .setCleanAnnotations(false) + + val docSimilarityRanker = new DocumentSimilarityRankerApproach() + .setInputCols("sentence_embeddings") + .setOutputCol(DOC_SIMILARITY_RANKINGS) + .setSimilarityMethod("brp") + .setNumberOfNeighbours(3) + .setVisibleDistances(true) + .setIdentityRanking(true) + + val documentSimilarityFinisher = new DocumentSimilarityRankerFinisher() + .setInputCols("doc_similarity_rankings") + .setOutputCols( + "finished_doc_similarity_rankings_id", + "finished_doc_similarity_rankings_neighbors") + .setExtractNearestNeighbor(true) + + val pipeline = new Pipeline() + .setStages( + Array( + 
documentAssembler, + sentence, + tokenizer, + embeddings, + embeddingsSentence, + sentenceFinisher, + docSimilarityRanker, + documentSimilarityFinisher)) + + val transformed = pipeline.fit(smallCorpus).transform(smallCorpus) + + transformed + .select("text", "sentence_embeddings.embeddings") + .withColumn("extracted_embeddings", element_at(col("embeddings"), 1)) + .withColumn("embeddings_size", size(col("extracted_embeddings"))) + .show(10, false) + } +} From ea4b5666498f0aa2bba3e6cb76bce354b90a3d97 Mon Sep 17 00:00:00 2001 From: Prabod Rathnayaka Date: Mon, 3 Jul 2023 17:15:51 +1000 Subject: [PATCH 10/13] Added notebooks (#13874) --- .../sentence-embeddings/E5Embeddings.ipynb | 238 ++++++++++++++++++ .../InstructorEmbeddings.ipynb | 202 +++++++++++++++ 2 files changed, 440 insertions(+) create mode 100644 examples/python/annotation/text/english/sentence-embeddings/E5Embeddings.ipynb create mode 100644 examples/python/annotation/text/english/sentence-embeddings/InstructorEmbeddings.ipynb diff --git a/examples/python/annotation/text/english/sentence-embeddings/E5Embeddings.ipynb b/examples/python/annotation/text/english/sentence-embeddings/E5Embeddings.ipynb new file mode 100644 index 00000000000000..7b1b39ce5ac93b --- /dev/null +++ b/examples/python/annotation/text/english/sentence-embeddings/E5Embeddings.ipynb @@ -0,0 +1,238 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/sentence-embeddings/E5Embeddings.ipynb)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Colab Setup" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "\n", + "from sparknlp.base import *\n", + "from sparknlp.common import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.ml import Pipeline\n", + "import pandas as pd\n", + "\n", + "# Comment out this line and uncomment the next one to enable GPU mode and High RAM\n", + "# \n", + "spark = sparknlp.start()\n", + "\n", + "# spark = sparknlp.start(gpu=True)\n", + "\n", + "print(\"Spark NLP version\", sparknlp.version())\n", + "print(\"Apache Spark version:\", spark.version)\n", + "\n", + "spark" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Download E5Embedding Model and Create Spark NLP Pipeline" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lets create a Spark NLP pipeline with the following stages:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol(\"text\") \\\n", + " .setOutputCol(\"documents\")\n", + "\n", + "instruction = E5Embeddings.pretrained(name='e5_small', lang='en') \\\n", + " .setInputCols([\"documents\"]) \\\n", + " .setOutputCol(\"e5\")\n", + "\n", + "# Build pipeline with BART\n", + "pipe_components = [document_assembler, instruction]\n", + "pipeline = Pipeline().setStages( pipe_components)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lets create a dataframe with some queries and passages to be used as input for the pipeline." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "data = spark.createDataFrame([\n", + " [1, \"query: how much protein should a female eat\"],\n", + " [2, \"query: summit define\"],\n", + " [3, \"passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 \"\n", + " \"is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're \"\n", + " \"expecting or training for a marathon. Check out the chart below to see how much protein you should \"\n", + " \"be eating each day.\", ],\n", + " [4, \"passage: Definition of summit for English Language Learners. : 1 the highest point of a mountain :\"\n", + " \" the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the \"\n", + " \"leaders of two or more governments.\"]\n", + " ]).toDF(\"id\", \"text\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the pipeline and get the embeddings." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 0:> (0 + 1) / 1]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|embeddings |\n", + 
"+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[[8.0190634E-4, -0.0059748273, -0.07287591, 0.007944053, 0.02605933, -0.008906806, -0.06441701, 0.024911521, -0.047772937, 0.05289171, -0.07640483, -0.026668454, 0.016677942, -0.052422184, 0.010135774, -0.045901768, 0.07052123, 0.025814291, 0.015971916, -0.0044898214, -0.056919493, -0.0031456274, 0.01570508, 0.071191914, 0.04829567, -0.027908832, -0.0041969656, -0.0012053477, -0.033321492, -0.034122076, -0.003202453, -0.05816862, -0.018836519, 0.0070868484, 7.498736E-4, -0.01372588, -0.016611278, -0.018659577, 0.055038612, 0.032351818, -0.039988246, 0.02073053, 0.043623917, 0.031571534, 0.014883104, -0.052890092, 0.03743348, 0.04072233, -0.052592196, 0.10222303, 0.025196373, -0.017820569, 0.0064059016, 0.036609165, -0.0932597, -0.08415129, -0.023488384, -0.034358907, -0.017385254, 0.026927387, 0.09005304, -0.036002014, -0.005586472, -0.008976968, -0.046990536, 0.02584159, -0.065852605, 
0.04310441, -0.03972358, 0.05337572, -0.020093162, 0.07993953, -0.050127633, -0.08322826, -0.021044549, 0.0036180215, -0.036280263, 0.03565733, 0.08276233, -0.07065905, 0.0844608, -0.038433485, -0.047383163, -0.06802774, 0.024673589, 0.0041557793, 0.06292143, 0.039271303, -0.034774322, 0.060959317, 0.061578527, -0.02743444, -0.05438446, 0.06919637, -0.045766305, -0.0017009391, 0.016559575, -0.07148739, 0.04352766, 0.057605024, -0.023092218, 0.077176414, -0.021328526, -0.016377728, -0.021664856, 0.0012897812, -0.05495426, 0.06665424, 0.031768065, -0.0013049082, -0.048237618, 0.055276737, -0.064812966, -0.063992724, -0.03606062, 0.048113704, 0.05724181, 0.049102053, -0.055841904, -0.06616128, -0.035773326, -0.05784446, 0.011544121, -0.05832887, 0.04989961, -0.05320584, 0.035013728, -0.023854692, -0.05075947, -0.06416691, 0.107668765, -0.10972186, -0.0065452163, 0.023789326, -0.042432584, -0.040789988, -0.06454592, -0.024564223, 0.05244426, 0.08527635, 0.07357475, 0.028319698, 0.0073429323, -0.04473503, -0.010060713, -0.08238728, 0.06350935, -0.0041905553, -0.032287933, 0.016135307, -0.07279008, -0.060838483, -0.075661935, -0.018997295, -0.10674084, -0.038688067, -0.041968126, 0.044574235, -0.013926045, -0.04399444, -0.061030895, 0.01876777, 0.2015332, 0.04924441, -0.06428397, 0.0055048694, 0.045243688, 0.019879399, -0.11607197, -0.06309459, 0.07130344, -0.066527866, 0.055443194, 0.059413664, 0.047937185, -0.04706782, 0.042067964, 0.045315195, -0.018378934, 0.059289962, 0.058545336, 0.077677235, -0.053936813, -0.068784796, -0.01122504, -0.05577791, -0.0597581, -0.085670926, -0.024510825, -0.022879539, 0.04153722, -0.017755607, -0.008197127, 0.04365088, 0.060591064, 0.04047161, -0.0713752, -0.1002828, -0.049343295, -0.03154564, -0.069920175, 0.06718735, -0.04267392, -0.06830617, -0.04010028, -0.029068153, 0.005669829, -0.046834853, 0.016869524, -0.018486049, -0.05707121, 0.0051633087, -0.011561127, -0.06641371, -0.045334544, -0.029238867, 0.050093777, -0.012342574, 
-0.052549604, -0.021395944, 0.11013109, 0.059675455, -0.03147869, -0.036708128, -0.025666263, -0.029993327, 0.057324022, -0.03827542, 0.055800788, -0.08110101, 0.04913242, 0.06504735, 5.7642825E-4, -0.026496444, 0.104300454, -0.027610747, 0.06303539, -0.043199394, -0.032151833, 0.0081282295, -0.0869248, -0.03128344, 0.008973062, -0.028620848, 0.014023963, 0.15582071, 0.01409474, -0.015195647, 0.03788257, -0.11051873, -0.11961075, 0.0027712767, 0.021449607, -0.06306116, -0.0100019695, 0.02859641, 0.051174663, -0.019328658, 0.036077373, -0.00900092, -0.05123872, 0.031075655, 0.09434415, -0.0040413775, 0.04011734, 0.013343716, 0.010123648, 0.048147667, -0.0048306864, -0.039672565, -0.016133113, -0.040045585, 0.12709293, -0.078152575, -0.009307059, 0.04066555, -0.015656175, 0.038391676, 0.03189456, -0.04254065, -0.061922066, 0.0090806, -0.0060465243, 0.056813955, -0.024268918, -0.0151320025, -0.07785242, 0.03586879, 0.051517483, 0.037140526, -0.03183444, -0.04581985, 0.020517277, 0.04279628, 0.043966606, -0.0071221977, -0.017498031, -0.086210534, 0.06676437, 0.0045599993, -0.041278265, -0.04839848, -0.011422011, -0.05190244, 0.03776515, 0.07570742, 0.015314608, 0.01837487, -0.02073799, -0.013036853, -0.008524046, -0.07346627, 0.045172296, -0.07489482, -0.04293508, 0.025039628, 0.038624883, -0.022110004, 0.055056587, 0.08392655, -0.08887726, 0.07680922, -0.033238426, 0.05364916, -0.06270089, 0.05340719, -0.025886936, 0.022084506, 0.005077468, -0.0015440907, 0.08324973, 0.06370375, -0.0594707, -0.038611326, 0.014417709, -0.002274329, 0.08430467, 0.0264819, 0.05033191, -0.05418442, -0.055640634, -0.067436405, -0.06329774, 0.026380884, -0.0109146545, -0.018956918, 0.02125892, -4.7127393E-4, 0.034171373, -0.04507203, 0.017495591, 0.06305414, -0.032933876, 0.1742396, 0.0081197405, -0.033936664, 0.050897475, 0.023332221, 0.06703659, 0.019969575, 0.075151876, -0.0029269294, 0.027489392, -0.027051851, 0.047261342, -0.02811436, -0.056598973, -0.023893198, 0.025900016, 
0.09682621, 0.07884969, 0.08860462, 0.04816566, -0.0067815683, 0.01798487, -0.062052652, 0.03110704, 0.016086599, -0.013651433, -0.038989987, 0.006860349, -0.027243046, -3.3595285E-4, 0.047972273]]|\n", + "|[[0.050514217, 0.010061959, -0.043401722, -0.020937217, 0.05170227, 0.0115785785, -0.014852718, 0.05612109, -0.03891183, 0.066299364, -0.03515178, -0.022517513, 0.08408398, -0.027083544, 0.00433425, -0.06589884, 0.04857633, 0.055115573, -0.005459747, -0.012708587, -0.064947695, -0.03886527, 0.045916263, 0.050418455, 0.016092516, -0.006044648, -0.027524851, -0.018189956, 0.06563501, -0.016984938, -0.0093430495, 0.009116089, -0.042847715, 0.005613071, -0.01928321, 0.026421025, -0.012707652, 0.010813174, 0.02068017, 0.056153208, -0.039185297, 0.008094586, 0.04619643, 0.0022446273, -0.049354173, -0.013162017, 0.07299674, 0.025019942, -0.011265003, 0.06276216, 0.0218438, -0.052990578, 0.0725218, 0.02567762, -0.061442886, -0.07660899, -0.03868493, -0.055644576, -0.019055603, 0.045566455, 0.02512236, -0.015899707, 0.0038913572, -0.0028755201, -0.048972517, 0.0106449295, -0.080716796, 0.012702321, -0.033552606, -0.003669624, -0.019061198, 0.09723396, -0.035534613, -0.07781177, 0.006672188, 0.062150657, -0.056711737, 0.014202608, 0.007152448, -0.09840831, 0.076315306, -0.027380891, -0.022671487, -0.098281324, -0.012208779, 0.0632356, -0.002242904, 0.01920342, -0.024914894, 0.04109726, 0.02734253, -0.051778194, -0.062881164, 0.02995512, -0.0171538, 0.013697907, -0.04104599, -0.011706119, 0.041975725, 0.029355679, -0.02188073, 0.052991357, -0.02743504, -0.058533102, -0.03923842, 0.022931447, 0.009424359, 0.03763383, 0.06548723, -0.051603366, -0.043681312, 0.022294188, 0.022880858, -0.06545931, 0.0019425638, 0.04528954, 0.07749907, -0.015015639, -0.04360742, -0.10555554, -0.03157314, -0.053837344, 0.045017723, -0.030513618, 0.06031093, -0.10449226, 0.0415827, -0.044642642, -0.033933233, -0.07488847, 0.049433526, -0.09393301, 0.030108633, 0.041681737, -0.08126291, 
-0.06484465, -0.015989598, -0.028418396, 0.025663346, 0.061176013, 0.034931734, 0.047142264, 0.0056433897, -0.06672263, -0.04455662, -0.08459777, -0.0019657516, -0.015161641, -0.059077278, -0.007999661, -0.008048766, -0.042218197, -0.018418107, -0.07400224, -0.12236316, -0.026515381, -0.0011941215, 0.06703735, -0.00760375, -0.11171125, -0.0551678, 0.022154907, 0.22849056, 0.029250056, -0.067700475, -0.0017139703, 0.05854688, 0.08612244, -0.082081825, -0.07523155, 0.08136499, -5.4742134E-4, 0.0042714295, -0.002481182, 0.085152715, -0.033430193, 0.07375195, 0.03320663, -0.024130106, 0.1060803, 0.03684335, 0.042666905, -0.050768342, -0.02783311, -0.006387174, -0.07549141, -0.07715287, -0.068362825, -0.024896825, -0.02814667, 0.027417168, -0.028032528, 0.0050684167, -0.005591774, 0.063782334, 0.026629169, -0.05545405, -0.08543345, -0.050796673, -0.046679914, -0.051220834, 0.099881895, -0.023391146, -0.034302153, -0.013100829, -0.015240464, 0.022707092, -0.023512404, -0.007161643, -0.044616815, -0.027834525, 0.008097733, 0.015200686, -0.11712076, -0.01660366, -0.04103086, 0.051542647, -0.02790766, -0.07317415, 0.028048566, 0.13280456, 0.013716013, -0.010398302, -0.05675694, -0.014644403, 2.5537412E-4, 0.05514908, -0.10900069, 0.029234862, -0.08227975, 0.016024971, 0.05632115, 0.041446786, -0.036259755, 0.05273922, 0.026686776, 0.057992097, -0.019857157, -0.06617782, 0.09076169, -0.10972492, -0.052146573, -0.026995018, -0.010368913, -8.0928305E-4, 0.10003201, 0.011824987, -0.022997612, 0.119170494, -0.023938997, -0.051829938, 0.051099405, 0.032705408, -0.09184957, -0.03525483, 0.08736037, 0.050133426, -0.0477982, 0.024231652, -0.050176796, -0.053517193, 0.0499123, 0.052419346, -0.052290242, 0.047902226, -0.020742523, 0.015861081, 0.08981698, 0.05756444, -0.070918605, -0.056598328, -0.06001114, 0.087662145, -0.09793287, 0.030500438, 0.029456677, -0.05837451, 0.018903349, 0.041268695, -0.041667357, -0.045342673, 0.067572854, -0.004576681, 0.041296944, -0.026670571, 
0.0029871026, -0.06921306, 0.011607438, 0.076620206, 0.03372765, -0.018414993, -0.036737546, 0.035903525, 0.07672415, 0.030570542, 0.05466485, -0.0082032, -0.01306696, 0.019325504, -0.021878157, 0.01271223, -0.0789027, -0.0424194, -0.041703302, 0.01355396, 0.02753331, -0.011476896, 0.069562174, -0.027159018, -0.055886365, 0.05469894, -0.059596337, 0.03306851, -0.019187255, -0.01625494, 0.015604595, 0.031338688, -0.035671216, 0.05984197, -0.030811163, -0.051394884, 0.072057985, -0.04720446, 0.051544618, -0.040733535, 0.06284145, -0.0011441609, 0.059360154, 0.03301165, -0.034525722, 0.067794636, 0.046829447, 0.021751009, -0.038297232, 0.010656714, 0.026854765, 0.07068686, 0.03197915, 0.013219161, -0.10968943, -0.05712275, 0.006463353, -0.07348946, -0.0012144869, 0.045827057, 0.023154305, 0.04130954, -0.027405972, -0.0054149725, -0.0643708, 0.0700982, 0.058635253, 0.0041305637, 0.13612802, -0.055352326, -0.05760716, -0.012670471, -0.025720399, 0.03318834, 0.05959147, 0.059002377, 0.0077214823, 0.03015882, -0.0024237563, 0.04185202, -0.02745781, -0.089868926, 0.012648689, 0.05107443, 0.11826391, -0.01603056, 0.024523359, 0.04260159, -0.010374576, -0.0017565754, -0.0752423, -0.012965751, -0.025219003, -0.06922553, -0.0084774075, -0.016652124, -9.014326E-4, -3.440816E-4, 0.08442783]] |\n", + "|[[0.018989779, -0.009931463, -0.050909195, -0.010832583, 0.011106625, -0.017240994, -0.0559874, 0.018495563, -0.034127317, 0.06253831, -0.07972209, -0.02549596, 0.031011619, -0.081661075, 0.017395245, -0.038124923, 0.08653003, 0.06302286, 0.0038844212, -0.020395814, -0.06674082, 0.030541096, 0.01216777, 0.03968276, 0.06084151, -0.03332047, -0.017171964, 0.028372508, -0.024995623, -0.04180733, -0.009422024, -0.06639663, -0.038453206, 8.3635526E-4, -9.99833E-4, -0.019106021, -0.012258235, -0.021185735, 0.045642756, 0.025936848, -0.06591746, -0.006214973, 0.04268613, 0.055423982, 0.010420288, -0.02512154, 0.05639567, 0.020754565, -0.04829447, 0.09239937, 0.022932816, 0.008693588, 
0.009368283, 0.050366987, -0.06287823, -0.054770287, -0.013346733, -0.04746888, -0.0074498267, 0.021988954, 0.07340651, -0.03760947, 0.003974858, -0.020122679, -0.055294994, 0.022968244, -0.062032107, 0.019510848, -0.022021467, 0.079132564, 0.013245848, 0.08528002, -0.016532052, -0.09306296, -0.028115297, 0.030017799, -0.05151452, 0.024326546, 0.051413383, -0.083100945, 0.07114094, -0.059322506, -0.057029177, -0.05130371, -0.010442242, 0.018790083, 0.05962069, 0.05905749, -0.0712944, 0.015173576, 0.054719307, -0.044992708, -0.056201853, 0.07763705, -0.028188348, -0.02029475, 0.017582111, -0.08137457, 0.035103094, 0.025138754, -0.017248789, 0.09958034, -0.019970058, -0.03563139, -0.029508667, 0.004593335, -0.06871349, 0.073688105, 0.03003573, -0.02918909, -0.04548093, 0.06078114, -0.049574636, -0.04874967, -0.03752621, 0.050473005, 0.074096665, 0.054224998, -0.06888124, -0.08190637, -0.04870275, -0.09114179, 0.01400552, -0.05940734, 0.042924393, -0.049026098, 0.05728074, -0.028521553, -0.038617153, -0.08880101, 0.047092505, -0.12807734, -9.1031595E-4, 0.030808274, -0.057306223, -0.013562461, -0.047038995, -0.036410064, 0.025049737, 0.050513744, 0.07019907, 0.00809601, -1.01718266E-4, -0.03968192, -0.008264309, -0.06180306, 0.044833884, 0.005990344, -0.048021752, 0.018896868, -0.03872897, -0.06387793, -0.06162192, -9.621203E-4, -0.12506318, -0.0417601, -0.04132599, 0.0628325, -0.040630743, -0.041020535, -0.08615846, 0.040007953, 0.19199139, 0.007226425, -0.07060595, -0.005491781, 0.061921753, 0.025157189, -0.110942125, -0.049719993, 0.062280767, -0.047451444, 0.07468437, 0.03945548, 0.050246492, -0.027849613, 0.05124267, 0.044265877, 0.0047834865, 0.07973868, 0.027062217, 0.064963534, -0.049567778, -0.06755528, -0.046254385, -0.04979987, -0.037433427, -0.09266982, -0.017109558, -0.0136375185, 0.052482836, -0.029847074, -0.019559506, 0.052258752, 0.07243555, 0.034701124, -0.08223566, -0.09450027, -0.08830127, -0.029281527, -0.07216679, 0.04915156, -0.048411284, 
-0.08750839, -0.03939493, 5.562757E-4, 0.012049733, -0.024479948, -0.016674964, -0.026647888, -0.04910277, 0.02383224, -0.0058609936, -0.02324343, -0.01801144, -0.035984464, 0.048771355, -0.022314042, -0.07400442, 0.017883742, 0.12498806, 0.031762164, -0.071550824, -0.02863895, -0.018804863, -0.041136466, 0.050331898, -0.027568335, 0.0727583, -0.09941543, 0.045421813, 0.103788, 0.01559907, -0.03126825, 0.046194393, -0.09395668, 0.08638349, -0.045515187, -0.032807473, 0.0022439312, -0.054160476, -0.007090178, 0.022789832, -0.045629043, 0.013132213, 0.09803767, 0.0280078, -0.027132602, 0.030960932, -0.11726305, -0.10303315, -0.0154676875, 0.023656018, -0.060617216, 0.013450652, 0.026684277, 0.0310426, -0.044291858, 0.06987808, -0.011989069, -0.039249282, 0.032854047, 0.067537114, -3.369161E-4, 0.08212295, 0.031631075, 0.0015520776, 0.059880685, 0.006885123, -0.04457581, -0.012465529, -0.048568774, 0.1162519, -0.08501214, 0.029425448, 0.03398319, -0.0039973576, 0.030593025, 0.036897883, -0.0062118354, -0.07224062, 0.0031395212, 0.0038746952, 0.024435565, -0.022936258, -0.027715886, -0.019195855, 0.015369002, 0.022729047, 0.0056728534, -0.014643929, -0.052434333, 0.027552877, 0.044112675, 0.040135365, 0.016500462, -0.02451588, -0.06611075, 0.07003415, -0.0058244015, -0.025429213, -0.057742596, 0.0037730767, -0.055965245, 0.050414115, 0.049998917, 0.024162255, 0.015452598, -0.026186533, 0.012543057, 0.011794361, -0.06529411, 0.049729694, -0.08251195, -0.07674231, 0.023079263, 0.043492332, -0.05113144, 0.0438487, 0.08488436, -0.06955417, 0.0628578, 0.0011058371, 0.052122444, -0.053940404, 0.04632826, -0.03288585, 0.04154949, 0.011391672, -0.006716721, 0.0925415, 0.05963683, -0.07954907, -0.021309748, 0.038165532, 0.011549542, 0.087717935, 0.044547625, 0.04617844, -0.07405831, -0.024325445, -0.007804961, -0.066497855, 0.02967101, 0.011463103, -0.004120417, 0.0059474227, 8.325438E-4, 0.021602597, -0.005412865, 0.024981422, 0.061426077, 0.009199549, 0.14440517, 
-0.0070653805, -0.028356971, 0.080376565, 0.053036623, 0.06540423, 0.01072136, 0.07313401, -0.009076663, 0.02762415, -0.033449207, 0.034014855, -0.037617683, -0.097210184, -0.0069066463, 0.013069363, 0.104766816, 0.08332489, 0.09734244, 0.060438484, -0.0059225005, -0.003404502, -0.054851856, 0.053239256, 0.019848721, -0.015885731, -0.04090939, 0.0072429604, -0.06212979, -1.6652368E-4, 0.037070617]] |\n", + "|[[0.008951451, 0.002047678, -0.027728232, -0.017447572, 0.049190223, -0.022217216, -0.018645568, 0.06026955, -0.035538357, 0.057616595, -0.0487033, -0.046217814, 0.071691394, -0.03556469, 0.032665122, -0.051920988, 0.05123163, 0.06886265, -0.003531419, -0.019638332, -0.06169719, -0.033530474, 0.06452985, 0.041696943, 0.033638723, -0.022732656, -0.027154164, -0.007671356, 0.053003345, -0.04367331, -0.013476425, 0.005586995, -0.057579186, 0.006271123, -0.016048357, 0.034765303, -0.04760142, -0.007917086, 0.0044548926, 0.05636264, -0.06905147, 0.0018436982, 0.054839056, -0.014734278, -0.044486802, -0.0074577653, 0.07341106, 0.014857702, 0.0011825216, 0.05730089, 0.051789224, -0.042031024, 0.026419705, 0.041389517, -0.05964582, -0.044647247, -8.817892E-4, -0.05458213, -0.0066856495, 0.06568387, 0.043173138, 0.0066682324, 0.0054201046, 0.03389725, -0.045257997, 0.017640809, -0.07852259, 0.030879328, -0.05024359, 0.01068829, -0.031836793, 0.09884748, -0.04450384, -0.082050465, 4.5893074E-5, 0.06456313, -0.062139608, 0.0059877853, 0.016613565, -0.08869805, 0.077740796, 0.00498475, -0.03943852, -0.0602428, 0.017112631, 0.06206382, 0.010525039, 0.043788075, -0.04094458, 0.0194606, 0.03270265, -0.040276777, -0.059226595, 0.0087456135, 0.021835225, 0.021389892, -0.021046853, -0.0068812706, 0.046433244, 0.0050701206, -0.01448807, 0.049041413, -0.015509655, -0.041719858, -0.041372087, 0.02489313, 0.009438485, 0.05029621, 0.032169465, -0.06432967, -0.05155386, 0.05867298, 0.019229677, -0.0405438, -0.010436025, 0.07387345, 0.06474686, -0.028284362, -0.05571893, -0.10431585, 
0.009474407, -0.101009235, 0.044456538, -0.04371087, 0.063253015, -0.11769237, 0.011732325, -0.054858364, -0.01408751, -0.07618674, 0.027529223, -0.10366285, 0.051327813, 0.05379098, -0.07009311, -0.07012369, -0.016770953, -0.028461618, 0.043643307, 0.019657757, 0.05522064, 0.061367143, 0.009086249, -0.07460865, -0.06814743, -0.089532346, -0.00792504, -0.029181998, -0.048020236, 0.022176905, -0.016383486, -0.03300748, -0.033763863, -0.056937434, -0.1336115, -0.038821433, 0.023206739, 0.06723952, -0.045146707, -0.076787725, -0.07662455, 0.007027287, 0.1514399, 0.0053828955, -0.06116311, -0.019069852, 0.051458195, 0.10299531, -0.070478335, -0.056817204, 0.06759626, 0.018095188, 0.01604008, -0.0021822506, 0.05780667, -0.04178118, 0.030748717, 0.01856922, -0.01934128, 0.069452904, -0.0040468485, 0.03906673, -0.08489104, -0.0337089, -0.012383665, -0.072865576, -0.044546783, -0.09617078, -0.036278598, -0.057470374, 0.038883597, -0.025537029, 0.0012810708, -0.0042785355, 0.06919841, 0.03237531, -0.040509436, -0.10183149, -0.06037312, -0.019550124, -0.05629417, 0.07475169, -0.006572528, -0.06178848, -0.0017471531, -0.037176237, 0.0074103097, -0.0171616, -0.013143397, -0.031142933, -0.037537705, 0.020443702, 0.020992767, -0.08102101, 0.01346229, -0.05796864, 0.035196114, -0.027657399, -0.051900674, 0.024434835, 0.11076832, -0.01076129, -0.029325994, -0.044832803, 0.014564571, -0.010642134, 0.04925002, -0.099848576, 0.018992115, -0.07227981, 0.015174862, 0.05283973, 0.030575035, -0.0122303525, 0.05488553, -0.039935216, 0.06644076, -0.048201818, -0.07215202, 0.07534417, -0.089638636, -0.034018517, -0.026972547, 0.0036589468, 0.016017206, 0.05880774, 0.007948517, -0.026701761, 0.0913623, -0.047092043, -0.030011319, 0.031084377, 0.011169504, -0.081248075, 0.0024853167, 0.07678156, 0.05659484, -0.057400625, 0.016898401, -0.03635397, -0.03669536, 0.0434014, 0.02458235, 0.009596552, 0.08751737, -0.029217197, 0.011233325, 0.0976359, 0.052655842, -0.11937359, -0.062303647, 
-0.06244457, 0.08807406, -0.0882957, 0.05834653, 0.020138025, -0.059674628, -0.0051987236, 0.05766631, -0.076532446, -0.05564386, 0.07644322, -0.017506465, 0.045278337, -0.013249805, 0.011593899, -0.060265876, 0.01938485, 0.08545524, -0.02328853, -0.011818171, -0.032044597, 0.023500346, 0.074183665, 0.03152912, 0.061743114, -9.84546E-4, -0.001968116, -0.013993088, -0.040356196, 0.032418124, -0.09012429, 0.014020798, -0.06643449, 0.022772733, 0.053572316, 0.0015929339, 0.085229844, -0.0018870353, -0.080626, 0.035781212, -0.036744162, 0.03695423, -0.017534915, -0.030226598, 0.020552605, 0.07287197, -0.054664616, 0.053588815, -0.02225981, -0.07068558, 0.102010205, -0.022925237, 0.061874628, -0.02792686, 0.057174817, -0.0024679347, 0.034765754, 0.05497726, -0.02347704, 0.09718766, 0.0624977, 0.018514702, -0.034021024, 0.010482476, 0.02572729, 0.07822028, 0.016907735, 0.048230834, -0.11654213, -0.05333671, 0.03090191, -0.06550036, -0.009822582, 0.012966088, -0.0078824535, 0.10107768, -0.03993969, -0.002383274, -0.06556975, 0.05267322, 0.11976236, 0.015832867, 0.12010187, -0.06510945, -0.034379058, -0.029872132, -0.031294942, 0.011168219, 0.045075633, 0.07193406, 0.0107326275, 0.031243816, 0.007975038, 0.051437348, -0.032257486, -0.10472824, 0.031225154, 0.06973884, 0.09418269, -0.025745893, 0.065461196, 0.02490032, -0.0031282906, 0.009691013, -0.062396143, -0.0064097554, -0.0146010555, -0.069892704, -0.009253598, -0.019514546, -0.012665343, 0.015985252, 0.12815249]] |\n", + 
"+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "results = pipeline.fit(data).transform(data)\n", + "results.select(\"e5.embeddings\").show(truncate=False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Collect the results and save them to a Numpy array." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# collect embeddings as numpy array\n", + "embeddings = np.array([each[0][0] for each in results.select(\"e5.embeddings\").collect()])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Investigate the cosine similarity between the query and the passages." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[93.25909366721945, 74.10933523842462], [75.4203130378152, 92.58708611118642]]\n" + ] + } + ], + "source": [ + "scores = (embeddings[:2] @ embeddings[2:].T) * 100\n", + "print(scores.tolist())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:tempspark]", + "language": "python", + "name": "conda-env-tempspark-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/python/annotation/text/english/sentence-embeddings/InstructorEmbeddings.ipynb b/examples/python/annotation/text/english/sentence-embeddings/InstructorEmbeddings.ipynb new file mode 100644 index 00000000000000..1d973ceeb1a6cf --- /dev/null +++ b/examples/python/annotation/text/english/sentence-embeddings/InstructorEmbeddings.ipynb @@ -0,0 +1,202 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "78c1b739", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/sentence-embeddings/InstructorEmbeddings.ipynb)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Colab Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | 
bash" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "\n", + "from sparknlp.base import *\n", + "from sparknlp.common import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.ml import Pipeline\n", + "import pandas as pd\n", + "\n", + "# Comment out this line and uncomment the next one to enable GPU mode and High RAM\n", + "# \n", + "spark = sparknlp.start()\n", + "\n", + "# spark = sparknlp.start(gpu=True)\n", + "\n", + "print(\"Spark NLP version\", sparknlp.version())\n", + "print(\"Apache Spark version:\", spark.version)\n", + "\n", + "spark" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e9fa6df8", + "metadata": {}, + "source": [ + "# Download InstructorEmbedding Model and Create Spark NLP Pipeline" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "efef6c46", + "metadata": {}, + "source": [ + "Lets create a Spark NLP pipeline with the following stages:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a08d21e-a84a-433b-91e2-4b98e841f4bd", + "metadata": {}, + "outputs": [], + "source": [ + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol(\"text\") \\\n", + " .setOutputCol(\"documents\")\n", + "\n", + "instruction = InstructorEmbeddings.pretrained(name='instructor_base', lang='en') \\\n", + " .setInstruction(\"Represent the Wikipedia document for retrieval: \") \\\n", + " .setInputCols([\"documents\"]) \\\n", + " .setOutputCol(\"instructor\")\n", + "\n", + "# Build pipeline with BART\n", + "pipe_components = [document_assembler, instruction]\n", + "pipeline = Pipeline().setStages( pipe_components)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f3b0c64e", + "metadata": {}, + "source": [ + "Lets create a dataframe with some text to be annotated." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "06c931fb-9acc-41ba-a1b5-944732e55aa9", + "metadata": {}, + "outputs": [], + "source": [ + "data = spark.createDataFrame([\n", + " [1, \"\"\"Capitalism has been dominant in the Western world since the end of feudalism, but most feel[who?] that \n", + " the term \"mixed economies\" more precisely describes most contemporary economies, due to their containing both \n", + " private-owned and state-owned enterprises. In capitalism, prices determine the demand-supply scale. For \n", + " example, higher demand for certain goods and services lead to higher prices and lower demand for certain \n", + " goods lead to lower prices. \"\"\"]]).toDF(\"id\", \"text\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "cb66bf86", + "metadata": {}, + "source": [ + "Extract the embeddings from the embeddings layer." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "95dd8de4-3840-408a-b08f-3b10cd47faa9", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 0:> (0 + 1) / 1]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "23/06/07 12:28:56 WARN MemoryStore: Not enough space to cache broadcast_1 in memory! 
(computed 4.6 GiB so far)\n", + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|embeddings |\n", + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[[0.03262866, -0.02278552, 0.0013627451, -0.05349423, -0.086116955, -5.929518E-4, -0.0746425, -0.044477966, -0.030355768, -0.045733023, 0.05812091, 0.03179538, -0.04334866, -0.08494021, -0.035193942, 0.024432851, -4.2305558E-4, 0.007649129, 0.03761272, -0.015318542, 0.006661357, -0.018444803, -0.020876005, 0.055248085, -0.06265088, -0.07534279, 0.056617893, -0.004320507, -0.0316281, 0.018066075, 0.03855368, 0.0048050773, 0.028971815, -0.006877347, 0.07095821, 9.4414165E-4, 0.008437616, -0.027222378, 0.019745044, -0.027784333, -0.01716024, 0.073865384, 0.014833191, -0.022548206, 0.046016835, -0.011504175, 0.026386736, -0.050758336, 0.04021454, 0.008387009, 0.010211223, -0.039326932, -0.0030433831, 0.0046180817, 0.028014615, -0.016920675, -0.042716026, -0.02837658, -0.035181597, 0.06269657, 0.0174563, 0.0054392195, -0.012997994, -0.012604179, 0.04189872, 0.06422939, 0.021105042, 0.004485108, -0.04081728, 0.02868578, 4.2648325E-4, -0.0024915151, -0.021758735, -0.008838901, -0.024631336, 0.038712267, 0.048298344, 0.029415213, 0.04060175, 0.0037318747, -0.07376663, 0.009977012, -0.0369224, 0.03830379, 0.023158496, -0.006534989, -0.022895986, -0.021891661, 0.009341288, 0.051562686, -0.041494038, -0.06153403, -0.022601144, -0.02245917, -0.01168978, 0.015870133, 0.037891928, -0.009542872, -0.020980844, -0.009183635, -0.012983468, 0.05018471, -0.0018250593, -0.070793994, -0.030277211, -0.02960738, 0.03202237, 0.06728596, 
-0.021340372, 0.044753116, 0.05349561, 0.054893903, 0.010459065, -0.038412116, 0.032823145, -0.04889416, 0.07951994, 0.009052965, -0.038682017, 0.104713686, 0.06529797, 0.03164878, -0.031003132, -0.018868608, -0.017530592, 0.013969608, 0.03172863, -0.0062067593, 0.0027590087, 8.379946E-4, 0.038555548, 0.01284134, 0.05865323, -0.020161793, 0.037459202, -0.08371534, -0.0065423665, -0.032452952, -0.055465214, -0.08255787, 0.027767248, -0.026513763, -0.004781667, 0.010080541, -0.065526746, 0.01758705, -0.01110518, 0.008830303, 0.018352255, 0.050216753, -0.044997294, -0.024659796, 0.010087487, -0.010251949, 0.01201215, -0.01594586, -0.03205041, 0.024571417, 0.041641567, -0.05499973, 0.05176098, -0.0125949895, -0.10548955, 0.029421275, -0.08137917, 0.03397447, -0.024931302, -0.009527485, -0.027410189, 0.02738372, 0.029404989, 0.020453911, 0.025400365, 0.0026869376, 0.048473097, -0.04090833, 0.0032688805, 0.078267716, 0.058016837, -0.015349757, -0.02964962, 7.428633E-4, 0.07016278, 0.062107064, 0.0016332245, -0.0023410108, 0.014560618, 2.6762972E-4, 0.025807912, 0.026444439, 0.018800316, -0.090043195, 0.008166241, 0.04428991, -0.0136914905, -0.014657232, -0.021842893, 0.07175007, 0.0054828576, 0.0033373928, 0.030236384, 0.011349291, 0.023523187, 0.026469443, -0.011958254, 0.018875133, 0.011467337, 0.011527503, -0.014620561, 0.014909634, -0.03518385, 0.08208324, 0.024886962, -0.015513791, -0.038096245, 0.039179526, 0.034246866, -0.028717602, -0.025548302, 0.0368702, 0.007931793, 0.026460685, -0.00364875, -0.010761328, 0.054106724, -0.039791096, -0.07631004, 0.038830508, 0.06268836, -0.025078528, -0.026704583, -0.016394533, 0.03193358, 0.036612213, -0.024061156, -0.025402015, -4.0322175E-4, -0.062252726, -0.059745993, 0.008429678, 0.03852402, -0.055612803, -0.012058595, -0.044576805, 0.005232973, 0.007215015, -0.04405842, -0.022040177, 6.000543E-4, -0.032302897, -0.033665285, -0.008454371, 0.023602277, 0.006704567, -0.033100456, -0.01915944, -0.029278649, -0.027450416, 
0.007982758, 0.02155662, -0.06519412, -0.020685513, 0.036739424, -0.019138498, 0.007845404, -0.0036733786, 0.03498778, -0.008669832, 0.032490708, 0.021262223, -0.020452157, 0.042844422, 0.039352544, -0.02001723, 0.0046986653, 0.039409067, -0.01183346, -0.034159135, -0.032669347, 0.034656446, -0.0026955863, -0.058706068, -0.014528975, 0.011225362, -0.018192712, 0.024715172, -0.013345554, -0.06291966, 0.028073484, -0.020382162, 0.003955219, 0.051585477, -0.059672885, -0.028786313, -0.049310677, 0.022147192, -0.016901385, 0.043830827, 0.052221525, -0.007187432, 0.0077309003, 0.022526953, -0.015928626, -0.058199972, -0.021536296, -0.00266462, 0.021913854, 0.025801338, -0.07448505, -0.01068346, 0.026055792, 0.003932057, 0.030646225, 0.020291511, -0.022222407, -0.049949016, 0.034922905, 0.031354185, 0.0044451198, 0.033286795, 0.047622625, -0.039508287, 0.016014116, -0.012728955, -0.014505607, 0.051102817, 0.05280988, 0.043907918, 0.037854925, -0.077935375, -0.028005999, 0.05174665, -0.025033204, 0.028977908, 0.033971164, 0.080860905, -0.011379051, -6.125111E-4, 0.015531941, 0.029136108, -0.043808825, 0.021221649, -0.0036628984, -0.014902094, -0.022289492, -0.06090777, 0.045519885, 0.049842637, -0.03771459, -0.044942044, -0.0059738797, 0.008993613, 0.047185607, 0.03366262, 0.04698031, -0.033994306, 0.028037608, 0.015821587, -0.006695826, 0.010477219, 0.04892719, 0.01928435, 0.009722461, -0.009907463, 0.008717666, 0.022839326, -0.064767726, -0.08677013, 0.05109122, 0.06738159, 0.026279185, 0.017919093, 0.034425717, 0.012025932, -0.017000718, 0.01568243, -0.043481626, -0.066099755, -0.036917772, -0.053656776, -0.0073990393, -0.015794588, 0.05056942, 0.041876696, 0.08926835, 0.021864763, 0.008828377, -0.082242414, 0.0016147506, 0.01908765, 0.021238945, 0.01666622, 0.0137727065, 0.033908978, 0.008618177, -0.0415076, -0.02719992, 0.004980526, -0.013505137, -0.032252524, 0.027290987, 1.2833111E-4, 0.052202977, 0.05350614, 0.004707537, -0.001970308, -0.014455575, 0.029612198, 
-0.013719515, -0.013465883, -0.005294823, -0.037157647, -0.032241847, 0.010828067, 0.015217094, 0.010721237, 0.0109178, 0.023587238, 0.03139799, 0.028548352, 0.007332915, -0.034978863, 0.04591757, 0.07023253, -0.037546985, -0.0057449825, 0.01028897, 0.024115201, -0.05835967, 0.015728861, 0.007577405, 0.066445865, -0.0109921, 0.009578472, -0.018897576, 0.03284056, -0.0078084227, 0.022518951, 0.005147412, 0.038542423, -0.037588574, 0.01890949, -0.009531408, 0.0743924, 0.03420677, -0.00388404, 0.016388329, -0.05744358, 0.028442338, 0.022417674, 0.018632138, 0.058668043, -0.043136194, 0.02837622, 0.066264585, 0.031632416, 0.009375166, -0.021616708, -0.046904657, -0.011571442, -0.026009535, 0.005370311, 0.0013603637, -0.0054784706, -0.015194243, -0.040979613, -0.011726019, 0.041559637, 0.017277045, 0.009240102, -0.02853778, 0.05799228, -0.013103851, -0.081491776, -0.05242484, -0.0498567, -0.011498945, -0.030589731, -0.06344852, -0.039167684, 0.037541926, 0.025322966, 0.03585917, -0.06897881, 0.013983291, 0.07146157, -0.058129825, -0.012389879, 0.044209927, 0.003640095, -0.03880821, -0.0061029145, 0.024704812, 0.0130145475, -0.008639259, -0.07097213, -0.037620023, -0.06390397, 0.012232612, -0.05413426, -0.026953025, -0.008528179, 7.501531E-4, 0.06066094, -8.8505604E-4, 0.015460086, 0.0061789826, 0.02452416, -0.021089591, 0.042983003, 0.0565328, 0.049946148, 0.0053586145, 0.01441645, -0.014378864, 0.0024458088, 0.01483357, 0.031564116, 0.0065488485, -0.027081812, -0.072921626, -0.028170047, 0.054090757, 0.02424426, -0.005840947, 0.0025784718, -0.008157144, -0.06483979, -0.0625094, 0.07688747, -0.016405731, -0.0011064977, -0.046268202, -0.025038263, 0.029777573, -3.960068E-4, 0.018920312, -0.022698821, -0.056359243, -0.035240088, -0.054945026, 0.060251184, -0.007809836, 0.032234326, -0.011087703, -0.018321337, 0.018414224, -0.040390566, 0.024843065, -0.0031665978, 0.07884546, -0.0035033596, -0.05727815, 0.043211542, -0.008092797, 0.08816403, 0.040669575, 0.0035789618, 
-0.0021728065, -0.034989733, -0.06114682, 0.015779695, -0.05540598, 0.027801292, -0.055221975, -0.014599996, 0.008345163, -0.054678977, 0.0057505197, 0.009509776, 0.086219944, 0.0027056169, -0.012578616, -0.019075787, 0.06411902, -0.0015007671, 0.024660196, -0.02524281, 0.018707322, 1.7817405E-5, -0.0067316075, -0.07148548, -0.038645077, -0.074731976, 0.06418082, -0.0025319012, -0.024625022, 0.042892747, -0.015663806, 0.009740704, -0.028613178, 0.02524035, -0.025803398, -0.004221928, -0.035811353, 0.03623488, 0.05823448, -0.011964531, -0.006890309, -0.05232545, -0.021972118, 0.0039357906, -0.004678495, 0.023032501, 0.049222905, 0.05967132, 0.02711187, -0.028429024, 0.0597026, 0.008744213, 0.039600607, -0.01799164, 0.029899204, 0.04086879, -0.007106043, -0.024120256, -0.045608405, 0.02578875, 0.034966342, 0.051028572, -0.04749465, 0.050027035, -0.026982184, 0.023817034, -0.005344621, 0.033817418, -0.04080848, -0.04403071, -0.052517183, -0.0038946117, -0.033082206, 0.004636815, 0.026521523, -0.026402375, -0.002364965, -8.322088E-4, -0.0243335, 0.07126975, 0.009148769, 0.034467135, 0.012993687, -0.010860814, -0.044525325, 0.07306212, -0.048434332, -0.07555003, 0.03128192, -0.021581138, -0.05958449, 0.024803905, -0.006846708, -0.027540546, -0.02732654, -0.070398346, -0.018436609, -0.024355149, -0.027597638, 0.03856409, -0.021519573, -0.07238803, -0.087142274, 0.01989241, -0.02820587, 0.009047604, 0.036062315, -0.0041471967, -0.07270885, 0.018391611, 0.011368535, 0.030430099, -0.05325456, -0.030990591, -0.017405782, 0.0051884362, 0.008637634, 0.045851987, 0.010309327, -0.036312383, -0.026670815, -0.01030423, 0.00986426, -0.0049678143, 0.009745687, 0.0062699416, -0.02588413, 0.0044184714, 0.011547122, 0.026856875, 0.0025859745, 0.007370621, 0.009339734, -0.034624025, 0.015941104, -0.026927661, 0.014354207, -0.025712667, 0.015726479, 0.049310084, -0.050456293, -0.04896244, -0.0038412337, -0.018047825, 0.047674045, -0.064386874, 0.018670822, -0.026679922, 0.0319397, 
-0.024611896, 0.022062775, 0.032022644, -0.063417435, -0.012273067, 0.011973682, 6.4346654E-4, 0.0041928133, -0.018408531, -0.011742022, 0.009397221, 0.054278865, 0.043511264, -0.0127404295, -1.7254091E-4, -0.0070622903, -0.0041349228, -0.019627219, 0.023186563, -0.023316137, -0.06182324, -0.003907854, -2.5893108E-4, 0.06966795, -0.018856877, 3.508146E-4, 0.004480817, -0.0029769759, 0.024018057, 0.065449946, 0.044452745, 0.013281421, 0.047196712, 0.027858423, 0.007940354, 0.033436798, -0.019692471, 0.06342478, -0.021618921, -0.03593985, 0.02641698, -0.05189184, -0.043362506, -0.050688393, -0.005378341, 0.0033666724, 0.044950858, 0.02093605, 0.03626855, -0.024815692, 0.028359616, 0.021613698, -0.020997578, 0.0027879598, -0.017473606, 0.017178757, -0.030661179, -0.013124452, -0.01470106, -0.03801807, 0.053062066]]|\n", + "+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "results = pipeline.fit(data).transform(data)\n", + "results.select(\"instructor.embeddings\").show(truncate=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:tempspark]", + "language": "python", + "name": "conda-env-tempspark-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From b962451f0774612a87545aca6efe03f6ad8f94a7 Mon Sep 17 00:00:00 2001 From: Luca Martial <48870843+luca-martial@users.noreply.github.com> Date: Mon, 3 Jul 2023 12:03:56 +0200 Subject: [PATCH 11/13] Draft: Chore: conda recipe 
update (#13764) * update conda recipe * rm python build configs * update conda build instructions * update python version reqs * update recipe import test --------- Co-authored-by: Maziyar Panahi --- conda/README.md | 4 +-- conda/conda_build_config.yaml | 4 --- conda/meta.yaml | 46 ++++++++++++++++++++--------------- 3 files changed, 28 insertions(+), 26 deletions(-) delete mode 100644 conda/conda_build_config.yaml diff --git a/conda/README.md b/conda/README.md index 1ff6ddb645b173..b04e5a32133168 100644 --- a/conda/README.md +++ b/conda/README.md @@ -35,13 +35,13 @@ conda config --set anaconda_upload no Build `spark-nlp` from the latest PyPI tar: ```bash -conda build . --python=3.7 && conda build . --python=3.8 && conda build . --python=3.9 +conda build conda/ ``` Example of uploading Conda package to Anaconda Cloud: ```bash -anaconda upload /anaconda3/conda-bld/noarch/spark-nlp-version-py37_0.tar.bz2 +anaconda upload /anaconda3/conda-bld/noarch/spark-nlp-version-py_0.tar.bz2 ``` ## Install diff --git a/conda/conda_build_config.yaml b/conda/conda_build_config.yaml deleted file mode 100644 index 293f77e58aa60b..00000000000000 --- a/conda/conda_build_config.yaml +++ /dev/null @@ -1,4 +0,0 @@ -python: - - 3.7 - - 3.8 - - 3.9 diff --git a/conda/meta.yaml b/conda/meta.yaml index 3bf52ab98dc2cf..4ccecd03892a1e 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,30 +1,36 @@ -package: - name: "spark-nlp" - version: 4.4.4 +{% set name = "spark-nlp" %} +{% set version = "4.4.0" %} -app: - entry: spark-nlp - summary: Natural Language Understanding Library for Apache Spark. 
+package: + name: {{ name|lower }} + version: {{ version }} source: - fn: spark-nlp-4.4.4.tar.gz - url: https://files.pythonhosted.org/packages/f9/e4/5eb83ed1c68be9fca636f6c62f9e55da3f2e511818e2a8feb852d6986064/spark-nlp-4.4.4.tar.gz - sha256: d9e2f017ab7cf6e82e775c38862f1a4ee32bbb0af6619e0b9051e6737711b5b6 + url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/spark-nlp-{{ version }}.tar.gz + sha256: e76fdd82b966ca169ba8a1fdcfe2e684fc63abaf88de841d2eb881cacb5e0105 + build: - noarch: generic + noarch: python + script: {{ PYTHON }} -m pip install . -vv number: 0 - script: "python -m pip install . --no-deps -vv" requirements: - build: - - python + host: + - python >=3.7,<3.11 + - pip run: - - python + - python >=3.7,<3.11 + +test: + imports: + - sparknlp + commands: + - pip check + requires: + - pip about: - home: https://github.com/JohnSnowLabs/spark-nlp/ - license: Apache License 2.0 - license_family: APACHE - license_url: https://github.com/JohnSnowLabs/spark-nlp/blob/master/LICENSE - description: John Snow Labs Spark-NLP is a natural language processing library built on top of Apache Spark ML. It provides simple, performant & accurate NLP annotations for machine learning pipelines, that scale easily in a distributed environment. - summary: Natural Language Understanding Library for Apache Spark. + home: https://github.com/JohnSnowLabs/spark-nlp + summary: John Snow Labs Spark NLP is a natural language processing library built on top of Apache Spark ML. It provides simple, performant & accurate NLP annotations for machine learning pipelines, that scale easily in a distributed environment. 
+ license: Apache-2.0 + license_file: LICENSE From 0a4a8c208873aedeac40bf120d4f470247d4b5b8 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Mon, 3 Jul 2023 15:36:53 +0200 Subject: [PATCH 12/13] Update CHANGELOG [run doc] --- CHANGELOG | 18 ++++++++++ README.md | 70 ++++++++++++++++++++------------------ docs/_layouts/landing.html | 4 ++- docs/en/transformers.md | 11 +++++- python/README.md | 70 ++++++++++++++++++++------------------ 5 files changed, 105 insertions(+), 68 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 30f8d250436dee..3e611cda48316f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,21 @@ +======== +5.0.0 +======== +---------------- +New Features & Enhancements +---------------- +* **NEW:** Introducing support for ONNX Runtime in Spark NLP. ONNX Runtime is a high-performance inference engine for machine learning models in the ONNX format. ONNX Runtime has proved to considerably increase the performance of inference for many models. +* **NEW:** Introducing **InstructorEmbeddings** annotator in Spark NLP 🚀. `InstructorEmbeddings` can load new state-of-the-art INSTRUCTOR Models inherited from T5 for Text Embeddings. +* **NEW:** Introducing **E5Embeddings** annotator in Spark NLP 🚀. `E5Embeddings` can load new state-of-the-art E5 Models inherited from BERT for Text Embeddings. +* **NEW:** Introducing **DocumentSimilarityRanker** annotator in Spark NLP 🚀. `DocumentSimilarityRanker` is a new annotator that uses LSH techniques present in Spark ML lib to execute approximate nearest neighbours search on top of sentence embeddings, It aims to capture the semantic meaning of a document in a dense, continuous vector space and return it to the ranker search. + +---------------- +Bug Fixes +---------------- +* Fix BART issue with maxInputLength + + + ======== 4.4.4 ======== diff --git a/README.md b/README.md index 2e6dd3d53d5a51..a002cf66f1c285 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ environment. 
Spark NLP comes with **17000+** pretrained **pipelines** and **models** in more than **200+** languages. It also offers tasks such as **Tokenization**, **Word Segmentation**, **Part-of-Speech Tagging**, Word and Sentence **Embeddings**, **Named Entity Recognition**, **Dependency Parsing**, **Spell Checking**, **Text Classification**, **Sentiment Analysis**, **Token Classification**, **Machine Translation** (+180 languages), **Summarization**, **Question Answering**, **Table Question Answering**, **Text Generation**, **Image Classification**, **Automatic Speech Recognition**, **Zero-Shot Learning**, and many more [NLP tasks](#features). -**Spark NLP** is the only open-source NLP library in **production** that offers state-of-the-art transformers such as **BERT**, **CamemBERT**, **ALBERT**, **ELECTRA**, **XLNet**, **DistilBERT**, **RoBERTa**, **DeBERTa**, **XLM-RoBERTa**, **Longformer**, **ELMO**, **Universal Sentence Encoder**, **Facebook BART**, **Google T5**, **MarianMT**, **OpenAI GPT2**, and **Vision Transformers (ViT)** not only to **Python** and **R**, but also to **JVM** ecosystem (**Java**, **Scala**, and **Kotlin**) at **scale** by extending **Apache Spark** natively. +**Spark NLP** is the only open-source NLP library in **production** that offers state-of-the-art transformers such as **BERT**, **CamemBERT**, **ALBERT**, **ELECTRA**, **XLNet**, **DistilBERT**, **RoBERTa**, **DeBERTa**, **XLM-RoBERTa**, **Longformer**, **ELMO**, **Universal Sentence Encoder**, **Facebook BART**, **Instructor**, **E5**, **Google T5**, **MarianMT**, **OpenAI GPT2**, and **Vision Transformers (ViT)** not only to **Python** and **R**, but also to **JVM** ecosystem (**Java**, **Scala**, and **Kotlin**) at **scale** by extending **Apache Spark** natively. 
## Project's website @@ -48,25 +48,25 @@ documentation and examples - [Databricks Support](#databricks-support) - [EMR Support](#emr-support) - [Using Spark NLP](#usage) - - [Packages Cheatsheet](#packages-cheatsheet) - - [Spark Packages](#spark-packages) - - [Scala](#scala) - - [Maven](#maven) - - [SBT](#sbt) - - [Python](#python) - - [Pip/Conda](#pipconda) - - [Compiled JARs](#compiled-jars) - - [Apache Zeppelin](#apache-zeppelin) - - [Jupyter Notebook](#jupyter-notebook-python) - - [Google Colab Notebook](#google-colab-notebook) - - [Kaggle Kernel](#kaggle-kernel) - - [Databricks Cluster](#databricks-cluster) - - [EMR Cluster](#emr-cluster) - - [GCP Dataproc](#gcp-dataproc) - - [Spark NLP Configuration](#spark-nlp-configuration) + - [Packages Cheatsheet](#packages-cheatsheet) + - [Spark Packages](#spark-packages) + - [Scala](#scala) + - [Maven](#maven) + - [SBT](#sbt) + - [Python](#python) + - [Pip/Conda](#pipconda) + - [Compiled JARs](#compiled-jars) + - [Apache Zeppelin](#apache-zeppelin) + - [Jupyter Notebook](#jupyter-notebook-python) + - [Google Colab Notebook](#google-colab-notebook) + - [Kaggle Kernel](#kaggle-kernel) + - [Databricks Cluster](#databricks-cluster) + - [EMR Cluster](#emr-cluster) + - [GCP Dataproc](#gcp-dataproc) + - [Spark NLP Configuration](#spark-nlp-configuration) - [Pipelines & Models](#pipelines-and-models) - - [Pipelines](#pipelines) - - [Models](#models) + - [Pipelines](#pipelines) + - [Models](#models) - [Offline](#offline) - [Examples](#examples) - [FAQ](#faq) @@ -110,6 +110,8 @@ documentation and examples - BERT Sentence Embeddings (TF Hub & HuggingFace models) - RoBerta Sentence Embeddings (HuggingFace models) - XLM-RoBerta Sentence Embeddings (HuggingFace models) +- Instructor Embeddings (HuggingFace models) +- E5 Embeddings (HuggingFace models) - Sentence Embeddings - Chunk Embeddings - Unsupervised keywords extraction @@ -226,24 +228,23 @@ For more examples, you can visit our dedicated [examples](https://github.com/Joh ## 
Apache Spark Support -Spark NLP *5.0.0* has been built on top of Apache Spark 3.2 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x +Spark NLP *5.0.0* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x | Spark NLP | Apache Spark 2.3.x | Apache Spark 2.4.x | Apache Spark 3.0.x | Apache Spark 3.1.x | Apache Spark 3.2.x | Apache Spark 3.3.x | Apache Spark 3.4.x | |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| +| 5.0.x | NO | NO | YES | YES | YES | YES | YES | | 4.4.x | NO | NO | YES | YES | YES | YES | YES | -| 4.3.x | NO | NO | YES | YES | YES | YES | NO | -| 4.2.x | NO | NO | YES | YES | YES | YES | NO | -| 4.1.x | NO | NO | YES | YES | YES | YES | NO | -| 4.0.x | NO | NO | YES | YES | YES | YES | NO | -| 3.4.x | YES | YES | YES | YES | Partially | N/A | NO +| 4.3.x | NO | NO | YES | YES | YES | YES | NO | +| 4.2.x | NO | NO | YES | YES | YES | YES | NO | +| 4.1.x | NO | NO | YES | YES | YES | YES | NO | +| 4.0.x | NO | NO | YES | YES | YES | YES | NO | +| 3.4.x | YES | YES | YES | YES | Partially | N/A | NO | | 3.3.x | YES | YES | YES | YES | NO | NO | NO | | 3.2.x | YES | YES | YES | YES | NO | NO | NO | | 3.1.x | YES | YES | YES | YES | NO | NO | NO | | 3.0.x | YES | YES | YES | YES | NO | NO | NO | | 2.7.x | YES | YES | NO | NO | NO | NO | NO | -NOTE: Starting 4.0.0 release, the default `spark-nlp` and `spark-nlp-gpu` packages are based on Scala 2.12.15 and Apache -Spark 3.2 by default. Find out more about `Spark NLP` versions from our [release notes](https://github.com/JohnSnowLabs/spark-nlp/releases). 
@@ -251,6 +252,7 @@ Find out more about `Spark NLP` versions from our [release notes](https://github | Spark NLP | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10| Scala 2.11 | Scala 2.12 | |-----------|------------|------------|------------|------------|------------|------------|------------| +| 5.0.x | NO | YES | YES | YES | YES | NO | YES | | 4.4.x | NO | YES | YES | YES | YES | NO | YES | | 4.3.x | YES | YES | YES | YES | YES | NO | YES | | 4.2.x | YES | YES | YES | YES | YES | NO | YES | @@ -269,8 +271,6 @@ Spark NLP 5.0.0 has been tested and is compatible with the following runtimes: **CPU:** -- 7.3 -- 7.3 ML - 9.1 - 9.1 ML - 10.1 @@ -299,6 +299,10 @@ Spark NLP 5.0.0 has been tested and is compatible with the following runtimes: - 12.2 ML - 13.0 - 13.0 ML +- 13.1 +- 13.1 ML +- 13.2 +- 13.2 ML **GPU:** @@ -316,9 +320,8 @@ Spark NLP 5.0.0 has been tested and is compatible with the following runtimes: - 12.1 ML & GPU - 12.2 ML & GPU - 13.0 ML & GPU - -NOTE: Spark NLP 4.x is based on TensorFlow 2.7.x which is compatible with CUDA11 and cuDNN 8.0.2. The only Databricks -runtimes supporting CUDA 11 are 9.x and above as listed under GPU. 
+- 13.1 ML & GPU +- 13.2 ML & GPU ## EMR Support @@ -334,6 +337,7 @@ Spark NLP 5.0.0 has been tested and is compatible with the following EMR release - emr-6.8.0 - emr-6.9.0 - emr-6.10.0 +- emr-6.11.0 Full list of [Amazon EMR 6.x releases](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-release-6x.html) @@ -690,7 +694,7 @@ pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.0 Alternatively, you can mix in using `--jars` option for pyspark + `pip install spark-nlp` If not using pyspark at all, you'll have to run the instructions -pointed [here](#python-without-explicit-Pyspark-installation) +pointed [here](#python-without-explicit-pyspark-installation) ## Google Colab Notebook diff --git a/docs/_layouts/landing.html b/docs/_layouts/landing.html index b1724ad7c1e212..01c747fc2a7da8 100755 --- a/docs/_layouts/landing.html +++ b/docs/_layouts/landing.html @@ -224,7 +224,7 @@

    Transformers at Scale

    Unlock the power of Large Language Models with Spark NLP 🚀, the only open-source library that delivers cutting-edge transformers for production such as BERT, CamemBERT, ALBERT, ELECTRA, XLNet, DistilBERT, RoBERTa, DeBERTa, - XLM-RoBERTa, Longformer, ELMO, Universal Sentence Encoder, Facebook BART, Google T5, MarianMT, OpenAI GPT2, + XLM-RoBERTa, Longformer, ELMO, Universal Sentence Encoder, Facebook BART, Instructor Embeddings, E5 Embeddings, Google T5, MarianMT, OpenAI GPT2, Google ViT, ASR Wav2Vec2 and many more not only to Python, and R but also to JVM ecosystem (Java, Scala, and Kotlin) at scale by extending Apache Spark natively
    @@ -304,6 +304,8 @@

    NLP Features

  • Universal Sentence Encoder
  • Sentence Embeddings
  • Chunk Embeddings
  • +
  • Instructor Embeddings
  • +
  • E5 Embeddings
  • diff --git a/docs/api/com/johnsnowlabs/ml/util/ModelEngine$.html b/docs/api/com/johnsnowlabs/ml/util/ModelArch$.html similarity index 80% rename from docs/api/com/johnsnowlabs/ml/util/ModelEngine$.html rename to docs/api/com/johnsnowlabs/ml/util/ModelArch$.html index 905476e7d5874a..61e9585ccddfc0 100644 --- a/docs/api/com/johnsnowlabs/ml/util/ModelEngine$.html +++ b/docs/api/com/johnsnowlabs/ml/util/ModelArch$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.ml.util.ModelEngine - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.ml.util.ModelArch + + @@ -28,7 +28,7 @@ +
  • + + + + + + + + + val + + + wordEmbeddings: String + + + +
  • + + + + + + + + + val + + + zeroShotClassifier: String + + +
  • diff --git a/docs/api/com/johnsnowlabs/ml/util/ModelEngine.html b/docs/api/com/johnsnowlabs/ml/util/ModelEngine.html new file mode 100644 index 00000000000000..c3450617693e46 --- /dev/null +++ b/docs/api/com/johnsnowlabs/ml/util/ModelEngine.html @@ -0,0 +1,639 @@ + + + + + + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.ml.util.ModelEngine + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    Packages

    + +
    +
    +
    + +
    +
    t
    +

    com.johnsnowlabs.ml.util

    +

    ModelEngine + + + +

    +

    +
    + +

    + + sealed + trait + + + ModelEngine extends AnyRef + +

    + + +
    + + Linear Supertypes + +
    AnyRef, Any
    +
    + + Known Subclasses + + +
    + + +
    +
    +
    + + + + + +
    +
    +
    + Ordering +
      + +
    1. Alphabetic
    2. +
    3. By Inheritance
    4. +
    +
    +
    + Inherited
    +
    +
      +
    1. ModelEngine
    2. AnyRef
    3. Any
    4. +
    +
    + +
      +
    1. Hide All
    2. +
    3. Show All
    4. +
    +
    +
    + Visibility +
    1. Public
    2. All
    +
    +
    +
    + +
    +
    + + + + + + +
    +

    Value Members

    +
      +
    1. + + + + + + + + final + def + + + !=(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    2. + + + + + + + + final + def + + + ##(): Int + + +
      Definition Classes
      AnyRef → Any
      +
    3. + + + + + + + + final + def + + + ==(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    4. + + + + + + + + final + def + + + asInstanceOf[T0]: T0 + + +
      Definition Classes
      Any
      +
    5. + + + + + + + + + def + + + clone(): AnyRef + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    6. + + + + + + + + final + def + + + eq(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    7. + + + + + + + + + def + + + equals(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    8. + + + + + + + + + def + + + finalize(): Unit + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + classOf[java.lang.Throwable] + ) + +
      +
    9. + + + + + + + + final + def + + + getClass(): Class[_] + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    10. + + + + + + + + + def + + + hashCode(): Int + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    11. + + + + + + + + final + def + + + isInstanceOf[T0]: Boolean + + +
      Definition Classes
      Any
      +
    12. + + + + + + + + final + def + + + ne(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    13. + + + + + + + + final + def + + + notify(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    14. + + + + + + + + final + def + + + notifyAll(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    15. + + + + + + + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + + +
      Definition Classes
      AnyRef
      +
    16. + + + + + + + + + def + + + toString(): String + + +
      Definition Classes
      AnyRef → Any
      +
    17. + + + + + + + + final + def + + + wait(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    18. + + + + + + + + final + def + + + wait(arg0: Long, arg1: Int): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    19. + + + + + + + + final + def + + + wait(arg0: Long): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    20. +
    +
    + + + + +
    + +
    +
    +

    Inherited from AnyRef

    +
    +

    Inherited from Any

    +
    + +
    + +
    +
    +

    Ungrouped

    + +
    +
    + +
    + +
    + + + +
    +
    +
    + + diff --git a/docs/api/com/johnsnowlabs/ml/util/ONNX$.html b/docs/api/com/johnsnowlabs/ml/util/ONNX$.html new file mode 100644 index 00000000000000..5a78363c7891ff --- /dev/null +++ b/docs/api/com/johnsnowlabs/ml/util/ONNX$.html @@ -0,0 +1,689 @@ + + + + + + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.ml.util.ONNX + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    Packages

    + +
    +
    +
    + +
    +
    o
    +

    com.johnsnowlabs.ml.util

    +

    ONNX + + + +

    +

    +
    + +

    + + + object + + + ONNX extends ModelEngine with Product with Serializable + +

    + + +
    + + Linear Supertypes + +
    Serializable, Serializable, Product, Equals, ModelEngine, AnyRef, Any
    +
    + + +
    +
    +
    + + + + + +
    +
    +
    + Ordering +
      + +
    1. Alphabetic
    2. +
    3. By Inheritance
    4. +
    +
    +
    + Inherited
    +
    +
      +
    1. ONNX
    2. Serializable
    3. Serializable
    4. Product
    5. Equals
    6. ModelEngine
    7. AnyRef
    8. Any
    9. +
    +
    + +
      +
    1. Hide All
    2. +
    3. Show All
    4. +
    +
    +
    + Visibility +
    1. Public
    2. All
    +
    +
    +
    + +
    +
    + + + + + + +
    +

    Value Members

    +
      +
    1. + + + + + + + + final + def + + + !=(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    2. + + + + + + + + final + def + + + ##(): Int + + +
      Definition Classes
      AnyRef → Any
      +
    3. + + + + + + + + final + def + + + ==(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    4. + + + + + + + + final + def + + + asInstanceOf[T0]: T0 + + +
      Definition Classes
      Any
      +
    5. + + + + + + + + + def + + + clone(): AnyRef + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    6. + + + + + + + + + val + + + decoderModel: String + + + +
    7. + + + + + + + + + val + + + decoderWithPastModel: String + + + +
    8. + + + + + + + + + val + + + encoderModel: String + + + +
    9. + + + + + + + + final + def + + + eq(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    10. + + + + + + + + + def + + + equals(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    11. + + + + + + + + + def + + + finalize(): Unit + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + classOf[java.lang.Throwable] + ) + +
      +
    12. + + + + + + + + final + def + + + getClass(): Class[_] + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    13. + + + + + + + + final + def + + + isInstanceOf[T0]: Boolean + + +
      Definition Classes
      Any
      +
    14. + + + + + + + + + val + + + modelName: String + + + +
    15. + + + + + + + + + val + + + name: String + + + +
    16. + + + + + + + + final + def + + + ne(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    17. + + + + + + + + final + def + + + notify(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    18. + + + + + + + + final + def + + + notifyAll(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    19. + + + + + + + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + + +
      Definition Classes
      AnyRef
      +
    20. + + + + + + + + final + def + + + wait(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    21. + + + + + + + + final + def + + + wait(arg0: Long, arg1: Int): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    22. + + + + + + + + final + def + + + wait(arg0: Long): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    23. +
    +
    + + + + +
    + +
    +
    +

    Inherited from Serializable

    +
    +

    Inherited from Serializable

    +
    +

    Inherited from Product

    +
    +

    Inherited from Equals

    +
    +

    Inherited from ModelEngine

    +
    +

    Inherited from AnyRef

    +
    +

    Inherited from Any

    +
    + +
    + +
    +
    +

    Ungrouped

    + +
    +
    + +
    + +
    + + + +
    +
    +
    + + diff --git a/docs/api/com/johnsnowlabs/ml/util/PyTorch$.html b/docs/api/com/johnsnowlabs/ml/util/PyTorch$.html new file mode 100644 index 00000000000000..b867c74a574605 --- /dev/null +++ b/docs/api/com/johnsnowlabs/ml/util/PyTorch$.html @@ -0,0 +1,625 @@ + + + + + + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.ml.util.PyTorch + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    Packages

    + +
    +
    +
    + +
    +
    o
    +

    com.johnsnowlabs.ml.util

    +

    PyTorch + + + +

    +

    +
    + +

    + + + object + + + PyTorch extends ModelEngine with Product with Serializable + +

    + + +
    + + Linear Supertypes + +
    Serializable, Serializable, Product, Equals, ModelEngine, AnyRef, Any
    +
    + + +
    +
    +
    + + + + + +
    +
    +
    + Ordering +
      + +
    1. Alphabetic
    2. +
    3. By Inheritance
    4. +
    +
    +
    + Inherited
    +
    +
      +
    1. PyTorch
    2. Serializable
    3. Serializable
    4. Product
    5. Equals
    6. ModelEngine
    7. AnyRef
    8. Any
    9. +
    +
    + +
      +
    1. Hide All
    2. +
    3. Show All
    4. +
    +
    +
    + Visibility +
    1. Public
    2. All
    +
    +
    +
    + +
    +
    + + + + + + +
    +

    Value Members

    +
      +
    1. + + + + + + + + final + def + + + !=(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    2. + + + + + + + + final + def + + + ##(): Int + + +
      Definition Classes
      AnyRef → Any
      +
    3. + + + + + + + + final + def + + + ==(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    4. + + + + + + + + final + def + + + asInstanceOf[T0]: T0 + + +
      Definition Classes
      Any
      +
    5. + + + + + + + + + def + + + clone(): AnyRef + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    6. + + + + + + + + final + def + + + eq(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    7. + + + + + + + + + def + + + equals(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    8. + + + + + + + + + def + + + finalize(): Unit + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + classOf[java.lang.Throwable] + ) + +
      +
    9. + + + + + + + + final + def + + + getClass(): Class[_] + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    10. + + + + + + + + final + def + + + isInstanceOf[T0]: Boolean + + +
      Definition Classes
      Any
      +
    11. + + + + + + + + + val + + + name: String + + + +
    12. + + + + + + + + final + def + + + ne(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    13. + + + + + + + + final + def + + + notify(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    14. + + + + + + + + final + def + + + notifyAll(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    15. + + + + + + + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + + +
      Definition Classes
      AnyRef
      +
    16. + + + + + + + + final + def + + + wait(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    17. + + + + + + + + final + def + + + wait(arg0: Long, arg1: Int): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    18. + + + + + + + + final + def + + + wait(arg0: Long): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    19. +
    +
    + + + + +
    + +
    +
    +

    Inherited from Serializable

    +
    +

    Inherited from Serializable

    +
    +

    Inherited from Product

    +
    +

    Inherited from Equals

    +
    +

    Inherited from ModelEngine

    +
    +

    Inherited from AnyRef

    +
    +

    Inherited from Any

    +
    + +
    + +
    +
    +

    Ungrouped

    + +
    +
    + +
    + +
    + + + +
    +
    +
    + + diff --git a/docs/api/com/johnsnowlabs/ml/util/TensorFlow$.html b/docs/api/com/johnsnowlabs/ml/util/TensorFlow$.html new file mode 100644 index 00000000000000..b4961aa32387a0 --- /dev/null +++ b/docs/api/com/johnsnowlabs/ml/util/TensorFlow$.html @@ -0,0 +1,641 @@ + + + + + + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.ml.util.TensorFlow + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    Packages

    + +
    +
    +
    + +
    +
    o
    +

    com.johnsnowlabs.ml.util

    +

    TensorFlow + + + +

    +

    +
    + +

    + + + object + + + TensorFlow extends ModelEngine with Product with Serializable + +

    + + +
    + + Linear Supertypes + +
    Serializable, Serializable, Product, Equals, ModelEngine, AnyRef, Any
    +
    + + +
    +
    +
    + + + + + +
    +
    +
    + Ordering +
      + +
    1. Alphabetic
    2. +
    3. By Inheritance
    4. +
    +
    +
    + Inherited
    +
    +
      +
    1. TensorFlow
    2. Serializable
    3. Serializable
    4. Product
    5. Equals
    6. ModelEngine
    7. AnyRef
    8. Any
    9. +
    +
    + +
      +
    1. Hide All
    2. +
    3. Show All
    4. +
    +
    +
    + Visibility +
    1. Public
    2. All
    +
    +
    +
    + +
    +
    + + + + + + +
    +

    Value Members

    +
      +
    1. + + + + + + + + final + def + + + !=(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    2. + + + + + + + + final + def + + + ##(): Int + + +
      Definition Classes
      AnyRef → Any
      +
    3. + + + + + + + + final + def + + + ==(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    4. + + + + + + + + final + def + + + asInstanceOf[T0]: T0 + + +
      Definition Classes
      Any
      +
    5. + + + + + + + + + def + + + clone(): AnyRef + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    6. + + + + + + + + final + def + + + eq(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    7. + + + + + + + + + def + + + equals(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    8. + + + + + + + + + def + + + finalize(): Unit + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + classOf[java.lang.Throwable] + ) + +
      +
    9. + + + + + + + + final + def + + + getClass(): Class[_] + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    10. + + + + + + + + final + def + + + isInstanceOf[T0]: Boolean + + +
      Definition Classes
      Any
      +
    11. + + + + + + + + + val + + + modelName: String + + + +
    12. + + + + + + + + + val + + + name: String + + + +
    13. + + + + + + + + final + def + + + ne(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    14. + + + + + + + + final + def + + + notify(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    15. + + + + + + + + final + def + + + notifyAll(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    16. + + + + + + + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + + +
      Definition Classes
      AnyRef
      +
    17. + + + + + + + + final + def + + + wait(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    18. + + + + + + + + final + def + + + wait(arg0: Long, arg1: Int): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    19. + + + + + + + + final + def + + + wait(arg0: Long): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    20. +
    +
    + + + + +
    + +
    +
    +

    Inherited from Serializable

    +
    +

    Inherited from Serializable

    +
    +

    Inherited from Product

    +
    +

    Inherited from Equals

    +
    +

    Inherited from ModelEngine

    +
    +

    Inherited from AnyRef

    +
    +

    Inherited from Any

    +
    + +
    + +
    +
    +

    Ungrouped

    + +
    +
    + +
    + +
    + + + +
    +
    +
    + + diff --git a/docs/api/com/johnsnowlabs/ml/util/Unknown$.html b/docs/api/com/johnsnowlabs/ml/util/Unknown$.html new file mode 100644 index 00000000000000..92aa904a5ffc43 --- /dev/null +++ b/docs/api/com/johnsnowlabs/ml/util/Unknown$.html @@ -0,0 +1,625 @@ + + + + + + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.ml.util.Unknown + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    Packages

    + +
    +
    +
    + +
    +
    o
    +

    com.johnsnowlabs.ml.util

    +

    Unknown + + + +

    +

    +
    + +

    + + + object + + + Unknown extends ModelEngine with Product with Serializable + +

    + + +
    + + Linear Supertypes + +
    Serializable, Serializable, Product, Equals, ModelEngine, AnyRef, Any
    +
    + + +
    +
    +
    + + + + + +
    +
    +
    + Ordering +
      + +
    1. Alphabetic
    2. +
    3. By Inheritance
    4. +
    +
    +
    + Inherited
    +
    +
      +
    1. Unknown
    2. Serializable
    3. Serializable
    4. Product
    5. Equals
    6. ModelEngine
    7. AnyRef
    8. Any
    9. +
    +
    + +
      +
    1. Hide All
    2. +
    3. Show All
    4. +
    +
    +
    + Visibility +
    1. Public
    2. All
    +
    +
    +
    + +
    +
    + + + + + + +
    +

    Value Members

    +
      +
    1. + + + + + + + + final + def + + + !=(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    2. + + + + + + + + final + def + + + ##(): Int + + +
      Definition Classes
      AnyRef → Any
      +
    3. + + + + + + + + final + def + + + ==(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    4. + + + + + + + + final + def + + + asInstanceOf[T0]: T0 + + +
      Definition Classes
      Any
      +
    5. + + + + + + + + + def + + + clone(): AnyRef + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    6. + + + + + + + + final + def + + + eq(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    7. + + + + + + + + + def + + + equals(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    8. + + + + + + + + + def + + + finalize(): Unit + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + classOf[java.lang.Throwable] + ) + +
      +
    9. + + + + + + + + final + def + + + getClass(): Class[_] + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    10. + + + + + + + + final + def + + + isInstanceOf[T0]: Boolean + + +
      Definition Classes
      Any
      +
    11. + + + + + + + + + val + + + name: String + + + +
    12. + + + + + + + + final + def + + + ne(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    13. + + + + + + + + final + def + + + notify(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    14. + + + + + + + + final + def + + + notifyAll(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    15. + + + + + + + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + + +
      Definition Classes
      AnyRef
      +
    16. + + + + + + + + final + def + + + wait(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    17. + + + + + + + + final + def + + + wait(arg0: Long, arg1: Int): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    18. + + + + + + + + final + def + + + wait(arg0: Long): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    19. +
    +
    + + + + +
    + +
    +
    +

    Inherited from Serializable

    +
    +

    Inherited from Serializable

    +
    +

    Inherited from Product

    +
    +

    Inherited from Equals

    +
    +

    Inherited from ModelEngine

    +
    +

    Inherited from AnyRef

    +
    +

    Inherited from Any

    +
    + +
    + +
    +
    +

    Ungrouped

    + +
    +
    + +
    + +
    + + + +
    +
    +
    + + diff --git a/docs/api/com/johnsnowlabs/ml/util/index.html b/docs/api/com/johnsnowlabs/ml/util/index.html index 83e9ccb2a682ce..7cdfbf40e2ab7d 100644 --- a/docs/api/com/johnsnowlabs/ml/util/index.html +++ b/docs/api/com/johnsnowlabs/ml/util/index.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.ml.util - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.ml.util + + @@ -28,7 +28,7 @@ diff --git a/docs/api/com/johnsnowlabs/nlp/AnnotatorModel.html b/docs/api/com/johnsnowlabs/nlp/AnnotatorModel.html index fe7aca67c5c9c5..11c40c229b1072 100644 --- a/docs/api/com/johnsnowlabs/nlp/AnnotatorModel.html +++ b/docs/api/com/johnsnowlabs/nlp/AnnotatorModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.AnnotatorModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.AnnotatorModel + + @@ -28,7 +28,7 @@ diff --git a/docs/api/com/johnsnowlabs/nlp/AnnotatorType$.html b/docs/api/com/johnsnowlabs/nlp/AnnotatorType$.html index 3a62c406ec5664..69cffb502ca09d 100644 --- a/docs/api/com/johnsnowlabs/nlp/AnnotatorType$.html +++ b/docs/api/com/johnsnowlabs/nlp/AnnotatorType$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.AnnotatorType - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.AnnotatorType + + @@ -28,7 +28,7 @@ diff --git a/docs/api/com/johnsnowlabs/nlp/Doc2Chunk$.html b/docs/api/com/johnsnowlabs/nlp/Doc2Chunk$.html index 46df9e231f766c..a2f08cce1b5396 100644 --- a/docs/api/com/johnsnowlabs/nlp/Doc2Chunk$.html +++ b/docs/api/com/johnsnowlabs/nlp/Doc2Chunk$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.Doc2Chunk - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.Doc2Chunk + + @@ -28,7 +28,7 @@ diff --git a/docs/api/com/johnsnowlabs/nlp/HasBatchedAnnotateAudio.html b/docs/api/com/johnsnowlabs/nlp/HasBatchedAnnotateAudio.html index 9bbe083f31d62a..37156313f56ddc 100644 --- a/docs/api/com/johnsnowlabs/nlp/HasBatchedAnnotateAudio.html +++ 
b/docs/api/com/johnsnowlabs/nlp/HasBatchedAnnotateAudio.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.HasBatchedAnnotateAudio - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.HasBatchedAnnotateAudio + + @@ -28,7 +28,7 @@ diff --git a/docs/api/com/johnsnowlabs/nlp/HasClassifierActivationProperties.html b/docs/api/com/johnsnowlabs/nlp/HasClassifierActivationProperties.html index 89c3bc7410e3a1..445f4496d7b2ab 100644 --- a/docs/api/com/johnsnowlabs/nlp/HasClassifierActivationProperties.html +++ b/docs/api/com/johnsnowlabs/nlp/HasClassifierActivationProperties.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.HasClassifierActivationProperties - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.HasClassifierActivationProperties + + @@ -28,7 +28,7 @@ diff --git a/docs/api/com/johnsnowlabs/nlp/HasEngine.html b/docs/api/com/johnsnowlabs/nlp/HasEngine.html index 83aa0717ba10d4..55ee19cceb106f 100644 --- a/docs/api/com/johnsnowlabs/nlp/HasEngine.html +++ b/docs/api/com/johnsnowlabs/nlp/HasEngine.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.HasEngine - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.HasEngine + + @@ -28,7 +28,7 @@ diff --git a/docs/api/com/johnsnowlabs/nlp/HasFeatures.html b/docs/api/com/johnsnowlabs/nlp/HasFeatures.html index 87daffa01b8f30..a466cf6e483402 100644 --- a/docs/api/com/johnsnowlabs/nlp/HasFeatures.html +++ b/docs/api/com/johnsnowlabs/nlp/HasFeatures.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.HasFeatures - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.HasFeatures + + @@ -28,7 +28,7 @@ diff --git a/docs/api/com/johnsnowlabs/nlp/HasImageFeatureProperties.html b/docs/api/com/johnsnowlabs/nlp/HasImageFeatureProperties.html index 30d06cb5432a51..05b7ce87c772bd 100644 --- a/docs/api/com/johnsnowlabs/nlp/HasImageFeatureProperties.html +++ b/docs/api/com/johnsnowlabs/nlp/HasImageFeatureProperties.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 
ScalaDoc - com.johnsnowlabs.nlp.HasImageFeatureProperties - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.HasImageFeatureProperties + + @@ -28,7 +28,7 @@ diff --git a/docs/api/com/johnsnowlabs/nlp/HasMultipleInputAnnotationCols.html b/docs/api/com/johnsnowlabs/nlp/HasMultipleInputAnnotationCols.html index 601c83264af725..7c26ae5f4e658c 100644 --- a/docs/api/com/johnsnowlabs/nlp/HasMultipleInputAnnotationCols.html +++ b/docs/api/com/johnsnowlabs/nlp/HasMultipleInputAnnotationCols.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.HasMultipleInputAnnotationCols - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.HasMultipleInputAnnotationCols + + @@ -28,7 +28,7 @@ diff --git a/docs/api/com/johnsnowlabs/nlp/HasOutputAnnotatorType.html b/docs/api/com/johnsnowlabs/nlp/HasOutputAnnotatorType.html index 9b5cf747a8097b..2b6621b17a23dd 100644 --- a/docs/api/com/johnsnowlabs/nlp/HasOutputAnnotatorType.html +++ b/docs/api/com/johnsnowlabs/nlp/HasOutputAnnotatorType.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.HasOutputAnnotatorType - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.HasOutputAnnotatorType + + @@ -28,7 +28,7 @@ diff --git a/docs/api/com/johnsnowlabs/nlp/HasPretrained.html b/docs/api/com/johnsnowlabs/nlp/HasPretrained.html index 6bbc8d14fdf67c..b977e9dd50e440 100644 --- a/docs/api/com/johnsnowlabs/nlp/HasPretrained.html +++ b/docs/api/com/johnsnowlabs/nlp/HasPretrained.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.HasPretrained - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.HasPretrained + + @@ -28,7 +28,7 @@ diff --git a/docs/api/com/johnsnowlabs/nlp/HasProtectedParams$ProtectedParam.html b/docs/api/com/johnsnowlabs/nlp/HasProtectedParams$ProtectedParam.html index e9accc8464b9b8..e065b1966ce307 100644 --- a/docs/api/com/johnsnowlabs/nlp/HasProtectedParams$ProtectedParam.html +++ b/docs/api/com/johnsnowlabs/nlp/HasProtectedParams$ProtectedParam.html @@ -3,9 +3,9 @@ - Spark 
NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.HasProtectedParams.ProtectedParam - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.HasProtectedParams.ProtectedParam + + @@ -28,7 +28,7 @@

    Inherited from ReadBertDLModel

    +
    +

    Inherited from ReadOnnxModel

    Inherited from ReadTensorflowModel

    diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/BertEmbeddings.html b/docs/api/com/johnsnowlabs/nlp/embeddings/BertEmbeddings.html index ca97e4e558d8b4..9480e86678acb0 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/BertEmbeddings.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/BertEmbeddings.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.BertEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.BertEmbeddings + + @@ -28,7 +28,7 @@ @@ -506,7 +530,7 @@

    Inherited
      -
    1. BertEmbeddings
    2. HasEngine
    3. HasCaseSensitiveProperties
    4. HasStorageRef
    5. HasEmbeddingsProperties
    6. HasProtectedParams
    7. WriteTensorflowModel
    8. HasBatchedAnnotate
    9. AnnotatorModel
    10. CanBeLazy
    11. RawAnnotator
    12. HasOutputAnnotationCol
    13. HasInputAnnotationCols
    14. HasOutputAnnotatorType
    15. ParamsAndFeaturesWritable
    16. HasFeatures
    17. DefaultParamsWritable
    18. MLWritable
    19. Model
    20. Transformer
    21. PipelineStage
    22. Logging
    23. Params
    24. Serializable
    25. Serializable
    26. Identifiable
    27. AnyRef
    28. Any
    29. +
    30. BertEmbeddings
    31. HasEngine
    32. HasCaseSensitiveProperties
    33. HasStorageRef
    34. HasEmbeddingsProperties
    35. HasProtectedParams
    36. WriteOnnxModel
    37. WriteTensorflowModel
    38. HasBatchedAnnotate
    39. AnnotatorModel
    40. CanBeLazy
    41. RawAnnotator
    42. HasOutputAnnotationCol
    43. HasInputAnnotationCols
    44. HasOutputAnnotatorType
    45. ParamsAndFeaturesWritable
    46. HasFeatures
    47. DefaultParamsWritable
    48. MLWritable
    49. Model
    50. Transformer
    51. PipelineStage
    52. Logging
    53. Params
    54. Serializable
    55. Serializable
    56. Identifiable
    57. AnyRef
    58. Any

    @@ -526,7 +550,7 @@

    Instance Constructors

    -
    1. +
      1. @@ -541,7 +565,9 @@

        Instance Constructors

        BertEmbeddings()
        - +

        Annotator reference id.

        Annotator reference id. Used to identify elements in metadata or to refer to this annotator +type +

      2. @@ -1473,7 +1499,7 @@

        Value Members

        -
      3. +
      4. @@ -1488,7 +1514,7 @@

        Value Members

        getModelIfNotSet: Bert
        -

        +
      5. @@ -1668,7 +1694,7 @@

        Value Members

        Attributes
        protected
        Definition Classes
        Logging
        -
      6. +
      7. @@ -1683,8 +1709,8 @@

        Value Members

        inputAnnotatorTypes: Array[String]
        -

        Annotator reference id.

      8. @@ -2097,7 +2123,7 @@

        Value Members

        Definition Classes
        HasInputAnnotationCols
        -
      9. +
      10. @@ -2112,7 +2138,9 @@

        Value Members

        outputAnnotatorType:
        AnnotatorType
        -
        Definition Classes
        BertEmbeddingsHasOutputAnnotatorType
        +

        Output Annotator Types: WORD_EMBEDDINGS +

        Output Annotator Types: WORD_EMBEDDINGS +

        Definition Classes
        BertEmbeddingsHasOutputAnnotatorType
      11. @@ -2586,9 +2614,9 @@

        Value Members

      12. - + - + @@ -2597,7 +2625,7 @@

        Value Members

        def - setModelIfNotSet(spark: SparkSession, tensorflowWrapper: TensorflowWrapper): BertEmbeddings + setModelIfNotSet(spark: SparkSession, tensorflowWrapper: Option[TensorflowWrapper], onnxWrapper: Option[OnnxWrapper]): BertEmbeddings

        @@ -3070,6 +3098,22 @@

        Value Members

        Definition Classes
        ParamsAndFeaturesWritable → DefaultParamsWritable → MLWritable
        +
      13. + + + + + + + + + def + + + writeOnnxModel(path: String, spark: SparkSession, onnxWrapper: OnnxWrapper, suffix: String, fileName: String): Unit + + +
        Definition Classes
        WriteOnnxModel
      14. @@ -3138,6 +3182,8 @@

        Inherited from Inherited from HasEmbeddingsProperties

    Inherited from HasProtectedParams

    +
    +

    Inherited from WriteOnnxModel

    Inherited from WriteTensorflowModel

    @@ -3192,6 +3238,10 @@

    Parameters

    A list of (hyper-)parameter keys this annotator can take. Users can set and get the parameter values through setters and getters, respectively.

    +
    +

    Annotator types

    +

    + Required input and expected output annotator types

    Members

    diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/BertSentenceEmbeddings$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/BertSentenceEmbeddings$.html index 564012bbe8cca2..d79e2195acf2eb 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/BertSentenceEmbeddings$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/BertSentenceEmbeddings$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.BertSentenceEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.BertSentenceEmbeddings + + @@ -28,7 +28,7 @@ @@ -439,7 +463,7 @@

    Inherited
      -
    1. BertSentenceEmbeddings
    2. Serializable
    3. Serializable
    4. ReadBertSentenceDLModel
    5. ReadTensorflowModel
    6. ReadablePretrainedBertSentenceModel
    7. HasPretrained
    8. ParamsAndFeaturesReadable
    9. DefaultParamsReadable
    10. MLReadable
    11. AnyRef
    12. Any
    13. +
    14. BertSentenceEmbeddings
    15. Serializable
    16. Serializable
    17. ReadBertSentenceDLModel
    18. ReadOnnxModel
    19. ReadTensorflowModel
    20. ReadablePretrainedBertSentenceModel
    21. HasPretrained
    22. ParamsAndFeaturesReadable
    23. DefaultParamsReadable
    24. MLReadable
    25. AnyRef
    26. Any

    @@ -818,6 +842,22 @@

    Value Members

    @native()
    +
  • + + + + + + + + + val + + + onnxFile: String + + +
    Definition Classes
    ReadBertSentenceDLModelReadOnnxModel
  • @@ -914,6 +954,22 @@

    Value Members

    Definition Classes
    ReadBertSentenceDLModel
    +
  • + + + + + + + + + def + + + readOnnxModel(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, sessionOptions: Option[SessionOptions] = None): OnnxWrapper + + +
    Definition Classes
    ReadOnnxModel
  • @@ -1110,6 +1166,8 @@

    Inherited from SerializableInherited from Serializable

  • Inherited from ReadBertSentenceDLModel

    +
    +

    Inherited from ReadOnnxModel

    Inherited from ReadTensorflowModel

    diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/BertSentenceEmbeddings.html b/docs/api/com/johnsnowlabs/nlp/embeddings/BertSentenceEmbeddings.html index 987b8b92548395..f5ad8e568eee8d 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/BertSentenceEmbeddings.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/BertSentenceEmbeddings.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.BertSentenceEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.BertSentenceEmbeddings + + @@ -28,7 +28,7 @@
    @@ -502,7 +526,7 @@

    Inherited
      -
    1. BertSentenceEmbeddings
    2. HasEngine
    3. HasCaseSensitiveProperties
    4. HasStorageRef
    5. HasEmbeddingsProperties
    6. HasProtectedParams
    7. WriteTensorflowModel
    8. HasBatchedAnnotate
    9. AnnotatorModel
    10. CanBeLazy
    11. RawAnnotator
    12. HasOutputAnnotationCol
    13. HasInputAnnotationCols
    14. HasOutputAnnotatorType
    15. ParamsAndFeaturesWritable
    16. HasFeatures
    17. DefaultParamsWritable
    18. MLWritable
    19. Model
    20. Transformer
    21. PipelineStage
    22. Logging
    23. Params
    24. Serializable
    25. Serializable
    26. Identifiable
    27. AnyRef
    28. Any
    29. +
    30. BertSentenceEmbeddings
    31. HasEngine
    32. HasCaseSensitiveProperties
    33. HasStorageRef
    34. HasEmbeddingsProperties
    35. HasProtectedParams
    36. WriteOnnxModel
    37. WriteTensorflowModel
    38. HasBatchedAnnotate
    39. AnnotatorModel
    40. CanBeLazy
    41. RawAnnotator
    42. HasOutputAnnotationCol
    43. HasInputAnnotationCols
    44. HasOutputAnnotatorType
    45. ParamsAndFeaturesWritable
    46. HasFeatures
    47. DefaultParamsWritable
    48. MLWritable
    49. Model
    50. Transformer
    51. PipelineStage
    52. Logging
    53. Params
    54. Serializable
    55. Serializable
    56. Identifiable
    57. AnyRef
    58. Any

    @@ -2639,9 +2663,9 @@

    Value Members

    Max sentence length to process (Default: 128)

  • - + - + @@ -2650,7 +2674,7 @@

    Value Members

    def - setModelIfNotSet(spark: SparkSession, tensorflow: TensorflowWrapper): BertSentenceEmbeddings.this.type + setModelIfNotSet(spark: SparkSession, tensorflowWrapper: Option[TensorflowWrapper], onnxWrapper: Option[OnnxWrapper]): BertSentenceEmbeddings.this.type

    @@ -3124,6 +3148,22 @@

    Value Members

    Definition Classes
    ParamsAndFeaturesWritable → DefaultParamsWritable → MLWritable
    +
  • + + + + + + + + + def + + + writeOnnxModel(path: String, spark: SparkSession, onnxWrapper: OnnxWrapper, suffix: String, fileName: String): Unit + + +
    Definition Classes
    WriteOnnxModel
  • @@ -3192,6 +3232,8 @@

    Inherited from Inherited from HasEmbeddingsProperties

  • Inherited from HasProtectedParams

    +
    +

    Inherited from WriteOnnxModel

    Inherited from WriteTensorflowModel

    diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/CamemBertEmbeddings$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/CamemBertEmbeddings$.html index 6147b7a23e1c45..0fc4c3dc4b08d3 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/CamemBertEmbeddings$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/CamemBertEmbeddings$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.CamemBertEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.CamemBertEmbeddings + + @@ -28,7 +28,7 @@ @@ -439,7 +463,7 @@

    Inherited
      -
    1. DeBertaEmbeddings
    2. Serializable
    3. Serializable
    4. ReadDeBertaDLModel
    5. ReadSentencePieceModel
    6. ReadTensorflowModel
    7. ReadablePretrainedDeBertaModel
    8. HasPretrained
    9. ParamsAndFeaturesReadable
    10. DefaultParamsReadable
    11. MLReadable
    12. AnyRef
    13. Any
    14. +
    15. DeBertaEmbeddings
    16. Serializable
    17. Serializable
    18. ReadDeBertaDLModel
    19. ReadOnnxModel
    20. ReadSentencePieceModel
    21. ReadTensorflowModel
    22. ReadablePretrainedDeBertaModel
    23. HasPretrained
    24. ParamsAndFeaturesReadable
    25. DefaultParamsReadable
    26. MLReadable
    27. AnyRef
    28. Any

    @@ -818,6 +842,22 @@

    Value Members

    @native()
    +
  • + + + + + + + + + val + + + onnxFile: String + + +
    Definition Classes
    ReadDeBertaDLModelReadOnnxModel
  • @@ -914,6 +954,22 @@

    Value Members

    Definition Classes
    ReadDeBertaDLModel
    +
  • + + + + + + + + + def + + + readOnnxModel(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, sessionOptions: Option[SessionOptions] = None): OnnxWrapper + + +
    Definition Classes
    ReadOnnxModel
  • @@ -1142,6 +1198,8 @@

    Inherited from SerializableInherited from Serializable

    Inherited from ReadDeBertaDLModel

    +
    +

    Inherited from ReadOnnxModel

    Inherited from ReadSentencePieceModel

    diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/DeBertaEmbeddings.html b/docs/api/com/johnsnowlabs/nlp/embeddings/DeBertaEmbeddings.html index 61a9418412a10a..8fb4f119ef6300 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/DeBertaEmbeddings.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/DeBertaEmbeddings.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.DeBertaEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.DeBertaEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • @@ -402,7 +426,7 @@

    class - DeBertaEmbeddings extends AnnotatorModel[DeBertaEmbeddings] with HasBatchedAnnotate[DeBertaEmbeddings] with WriteTensorflowModel with WriteSentencePieceModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties with HasEngine + DeBertaEmbeddings extends AnnotatorModel[DeBertaEmbeddings] with HasBatchedAnnotate[DeBertaEmbeddings] with WriteTensorflowModel with WriteOnnxModel with WriteSentencePieceModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties with HasEngine

    @@ -480,7 +504,7 @@

    Linear Supertypes - + @@ -506,7 +530,7 @@

    Inherited
      -
    1. DeBertaEmbeddings
    2. HasEngine
    3. HasCaseSensitiveProperties
    4. HasStorageRef
    5. HasEmbeddingsProperties
    6. HasProtectedParams
    7. WriteSentencePieceModel
    8. WriteTensorflowModel
    9. HasBatchedAnnotate
    10. AnnotatorModel
    11. CanBeLazy
    12. RawAnnotator
    13. HasOutputAnnotationCol
    14. HasInputAnnotationCols
    15. HasOutputAnnotatorType
    16. ParamsAndFeaturesWritable
    17. HasFeatures
    18. DefaultParamsWritable
    19. MLWritable
    20. Model
    21. Transformer
    22. PipelineStage
    23. Logging
    24. Params
    25. Serializable
    26. Serializable
    27. Identifiable
    28. AnyRef
    29. Any
    30. +
    31. DeBertaEmbeddings
    32. HasEngine
    33. HasCaseSensitiveProperties
    34. HasStorageRef
    35. HasEmbeddingsProperties
    36. HasProtectedParams
    37. WriteSentencePieceModel
    38. WriteOnnxModel
    39. WriteTensorflowModel
    40. HasBatchedAnnotate
    41. AnnotatorModel
    42. CanBeLazy
    43. RawAnnotator
    44. HasOutputAnnotationCol
    45. HasInputAnnotationCols
    46. HasOutputAnnotatorType
    47. ParamsAndFeaturesWritable
    48. HasFeatures
    49. DefaultParamsWritable
    50. MLWritable
    51. Model
    52. Transformer
    53. PipelineStage
    54. Logging
    55. Params
    56. Serializable
    57. Serializable
    58. Identifiable
    59. AnyRef
    60. Any
    @@ -2552,9 +2576,9 @@

    Value Members

  • - + - + @@ -2563,7 +2587,7 @@

    Value Members

    def - setModelIfNotSet(spark: SparkSession, tensorflowWrapper: TensorflowWrapper, spp: SentencePieceWrapper): DeBertaEmbeddings + setModelIfNotSet(spark: SparkSession, tensorflowWrapper: Option[TensorflowWrapper], onnxWrapper: Option[OnnxWrapper], spp: SentencePieceWrapper): DeBertaEmbeddings

    @@ -2987,6 +3011,22 @@

    Value Members

    Definition Classes
    ParamsAndFeaturesWritable → DefaultParamsWritable → MLWritable
    +
  • + + + + + + + + + def + + + writeOnnxModel(path: String, spark: SparkSession, onnxWrapper: OnnxWrapper, suffix: String, fileName: String): Unit + + +
    Definition Classes
    WriteOnnxModel
  • @@ -3073,6 +3113,8 @@

    Inherited from HasProtectedParams

    Inherited from WriteSentencePieceModel

    +
    +

    Inherited from WriteOnnxModel

    Inherited from WriteTensorflowModel

    diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/DistilBertEmbeddings$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/DistilBertEmbeddings$.html index ecfeab00ba476d..430c2042f408d6 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/DistilBertEmbeddings$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/DistilBertEmbeddings$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.DistilBertEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.DistilBertEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • @@ -413,7 +437,7 @@

    Linear Supertypes - + @@ -439,7 +463,7 @@

    Inherited
      -
    1. DistilBertEmbeddings
    2. Serializable
    3. Serializable
    4. ReadDistilBertDLModel
    5. ReadTensorflowModel
    6. ReadablePretrainedDistilBertModel
    7. HasPretrained
    8. ParamsAndFeaturesReadable
    9. DefaultParamsReadable
    10. MLReadable
    11. AnyRef
    12. Any
    13. +
    14. DistilBertEmbeddings
    15. Serializable
    16. Serializable
    17. ReadDistilBertDLModel
    18. ReadOnnxModel
    19. ReadTensorflowModel
    20. ReadablePretrainedDistilBertModel
    21. HasPretrained
    22. ParamsAndFeaturesReadable
    23. DefaultParamsReadable
    24. MLReadable
    25. AnyRef
    26. Any
    @@ -818,6 +842,22 @@

    Value Members

    @native()
    +

  • + + + + + + + + + val + + + onnxFile: String + + +
    Definition Classes
    ReadDistilBertDLModelReadOnnxModel
  • @@ -914,6 +954,22 @@

    Value Members

    Definition Classes
    ReadDistilBertDLModel
    +
  • + + + + + + + + + def + + + readOnnxModel(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, sessionOptions: Option[SessionOptions] = None): OnnxWrapper + + +
    Definition Classes
    ReadOnnxModel
  • @@ -1110,6 +1166,8 @@

    Inherited from SerializableInherited from Serializable

    Inherited from ReadDistilBertDLModel

    +
    +

    Inherited from ReadOnnxModel

    Inherited from ReadTensorflowModel

    diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/DistilBertEmbeddings.html b/docs/api/com/johnsnowlabs/nlp/embeddings/DistilBertEmbeddings.html index 643c5871276fc0..732373291191a3 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/DistilBertEmbeddings.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/DistilBertEmbeddings.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.DistilBertEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.DistilBertEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • @@ -402,7 +426,7 @@

    class - DistilBertEmbeddings extends AnnotatorModel[DistilBertEmbeddings] with HasBatchedAnnotate[DistilBertEmbeddings] with WriteTensorflowModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties with HasEngine + DistilBertEmbeddings extends AnnotatorModel[DistilBertEmbeddings] with HasBatchedAnnotate[DistilBertEmbeddings] with WriteTensorflowModel with WriteOnnxModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties with HasEngine

    @@ -489,7 +513,7 @@

    Linear Supertypes - + @@ -515,7 +539,7 @@

    Inherited
      -
    1. DistilBertEmbeddings
    2. HasEngine
    3. HasCaseSensitiveProperties
    4. HasStorageRef
    5. HasEmbeddingsProperties
    6. HasProtectedParams
    7. WriteTensorflowModel
    8. HasBatchedAnnotate
    9. AnnotatorModel
    10. CanBeLazy
    11. RawAnnotator
    12. HasOutputAnnotationCol
    13. HasInputAnnotationCols
    14. HasOutputAnnotatorType
    15. ParamsAndFeaturesWritable
    16. HasFeatures
    17. DefaultParamsWritable
    18. MLWritable
    19. Model
    20. Transformer
    21. PipelineStage
    22. Logging
    23. Params
    24. Serializable
    25. Serializable
    26. Identifiable
    27. AnyRef
    28. Any
    29. +
    30. DistilBertEmbeddings
    31. HasEngine
    32. HasCaseSensitiveProperties
    33. HasStorageRef
    34. HasEmbeddingsProperties
    35. HasProtectedParams
    36. WriteOnnxModel
    37. WriteTensorflowModel
    38. HasBatchedAnnotate
    39. AnnotatorModel
    40. CanBeLazy
    41. RawAnnotator
    42. HasOutputAnnotationCol
    43. HasInputAnnotationCols
    44. HasOutputAnnotatorType
    45. ParamsAndFeaturesWritable
    46. HasFeatures
    47. DefaultParamsWritable
    48. MLWritable
    49. Model
    50. Transformer
    51. PipelineStage
    52. Logging
    53. Params
    54. Serializable
    55. Serializable
    56. Identifiable
    57. AnyRef
    58. Any
    @@ -2595,9 +2619,9 @@

    Value Members

  • - + - + @@ -2606,7 +2630,7 @@

    Value Members

    def - setModelIfNotSet(spark: SparkSession, tensorflowWrapper: TensorflowWrapper): DistilBertEmbeddings + setModelIfNotSet(spark: SparkSession, tensorflowWrapper: Option[TensorflowWrapper], onnxWrapper: Option[OnnxWrapper]): DistilBertEmbeddings

    @@ -3079,6 +3103,22 @@

    Value Members

    Definition Classes
    ParamsAndFeaturesWritable → DefaultParamsWritable → MLWritable
    +
  • + + + + + + + + + def + + + writeOnnxModel(path: String, spark: SparkSession, onnxWrapper: OnnxWrapper, suffix: String, fileName: String): Unit + + +
    Definition Classes
    WriteOnnxModel
  • @@ -3147,6 +3187,8 @@

    Inherited from Inherited from HasEmbeddingsProperties

    Inherited from HasProtectedParams

    +
    +

    Inherited from WriteOnnxModel

    Inherited from WriteTensorflowModel

    diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/Doc2VecApproach$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/Doc2VecApproach$.html index a6ad547754e492..d7acdb8b9df849 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/Doc2VecApproach$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/Doc2VecApproach$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.Doc2VecApproach - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.Doc2VecApproach + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/Doc2VecApproach.html b/docs/api/com/johnsnowlabs/nlp/embeddings/Doc2VecApproach.html index 356d3e537c90b5..8bdb56051c435e 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/Doc2VecApproach.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/Doc2VecApproach.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.Doc2VecApproach - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.Doc2VecApproach + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/Doc2VecModel$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/Doc2VecModel$.html index a532a148eb1d01..82a6951b1f18eb 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/Doc2VecModel$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/Doc2VecModel$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.Doc2VecModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.Doc2VecModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/Doc2VecModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/Doc2VecModel.html index 6dce56e5ee7e70..0ba1097cf8d5ee 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/Doc2VecModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/Doc2VecModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.Doc2VecModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.Doc2VecModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/E5Embeddings$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/E5Embeddings$.html new file mode 100644 index 00000000000000..0e9efd7eb6a808 --- /dev/null +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/E5Embeddings$.html @@ -0,0 +1,1174 @@ + + + + + + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.E5Embeddings + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    Packages

    + +
    +
    +
    + + + +

    + + + object + + + E5Embeddings extends ReadablePretrainedE5Model with ReadE5DLModel with Serializable + +

    + + +

    This is the companion object of E5Embeddings. Please refer to that class for the +documentation. +

    + + Linear Supertypes + + +
    + + +
    +
    +
    + + + + + +
    +
    +
    + Ordering +
      + +
    1. Alphabetic
    2. +
    3. By Inheritance
    4. +
    +
    +
    + Inherited
    +
    +
      +
    1. E5Embeddings
    2. Serializable
    3. Serializable
    4. ReadE5DLModel
    5. ReadTensorflowModel
    6. ReadablePretrainedE5Model
    7. HasPretrained
    8. ParamsAndFeaturesReadable
    9. DefaultParamsReadable
    10. MLReadable
    11. AnyRef
    12. Any
    13. +
    +
    + +
      +
    1. Hide All
    2. +
    3. Show All
    4. +
    +
    +
    + Visibility +
    1. Public
    2. All
    +
    +
    +
    + +
    +
    + + + + + + +
    +

    Value Members

    +
      +
    1. + + + + + + + + final + def + + + !=(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    2. + + + + + + + + final + def + + + ##(): Int + + +
      Definition Classes
      AnyRef → Any
      +
    3. + + + + + + + + final + def + + + ==(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    4. + + + + + + + + + def + + + addReader(reader: (E5Embeddings, String, SparkSession) ⇒ Unit): Unit + + +
      Definition Classes
      ParamsAndFeaturesReadable
      +
    5. + + + + + + + + final + def + + + asInstanceOf[T0]: T0 + + +
      Definition Classes
      Any
      +
    6. + + + + + + + + + def + + + clone(): AnyRef + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    7. + + + + + + + + + val + + + defaultLang: String + + +
      Definition Classes
      HasPretrained
      +
    8. + + + + + + + + + lazy val + + + defaultLoc: String + + +
      Definition Classes
      HasPretrained
      +
    9. + + + + + + + + + val + + + defaultModelName: Some[String] + + + +
    10. + + + + + + + + final + def + + + eq(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    11. + + + + + + + + + def + + + equals(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    12. + + + + + + + + + def + + + finalize(): Unit + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + classOf[java.lang.Throwable] + ) + +
      +
    13. + + + + + + + + final + def + + + getClass(): Class[_] + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    14. + + + + + + + + + def + + + hashCode(): Int + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    15. + + + + + + + + final + def + + + isInstanceOf[T0]: Boolean + + +
      Definition Classes
      Any
      +
    16. + + + + + + + + + def + + + load(path: String): E5Embeddings + + +
      Definition Classes
      MLReadable
      Annotations
      + @Since( + + "1.6.0" + ) + +
      +
    17. + + + + + + + + + def + + + loadSavedModel(modelPath: String, spark: SparkSession): E5Embeddings + + +
      Definition Classes
      ReadE5DLModel
      +
    18. + + + + + + + + final + def + + + ne(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    19. + + + + + + + + final + def + + + notify(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    20. + + + + + + + + final + def + + + notifyAll(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    21. + + + + + + + + + def + + + pretrained(name: String, lang: String, remoteLoc: String): E5Embeddings + + +

      Java default argument interoperability

      Java default argument interoperability

      Definition Classes
      ReadablePretrainedE5ModelHasPretrained
      +
    22. + + + + + + + + + def + + + pretrained(name: String, lang: String): E5Embeddings + + + +
    23. + + + + + + + + + def + + + pretrained(name: String): E5Embeddings + + + +
    24. + + + + + + + + + def + + + pretrained(): E5Embeddings + + +

      Java compliant-overrides

      Java compliant-overrides

      Definition Classes
      ReadablePretrainedE5ModelHasPretrained
      +
    25. + + + + + + + + + def + + + read: MLReader[E5Embeddings] + + +
      Definition Classes
      ParamsAndFeaturesReadable → DefaultParamsReadable → MLReadable
      +
    26. + + + + + + + + + def + + + readModel(instance: E5Embeddings, path: String, spark: SparkSession): Unit + + +
      Definition Classes
      ReadE5DLModel
      +
    27. + + + + + + + + + def + + + readTensorflowChkPoints(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, tags: Array[String] = Array.empty, initAllTables: Boolean = false): TensorflowWrapper + + +
      Definition Classes
      ReadTensorflowModel
      +
    28. + + + + + + + + + def + + + readTensorflowHub(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, tags: Array[String] = Array.empty): TensorflowWrapper + + +
      Definition Classes
      ReadTensorflowModel
      +
    29. + + + + + + + + + def + + + readTensorflowModel(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, tags: Array[String] = Array.empty, initAllTables: Boolean = false, savedSignatures: Option[Map[String, String]] = None): TensorflowWrapper + + +
      Definition Classes
      ReadTensorflowModel
      +
    30. + + + + + + + + + def + + + readTensorflowWithSPModel(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, tags: Array[String] = Array.empty, initAllTables: Boolean = false, loadSP: Boolean = false): TensorflowWrapper + + +
      Definition Classes
      ReadTensorflowModel
      +
    31. + + + + + + + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + + +
      Definition Classes
      AnyRef
      +
    32. + + + + + + + + + val + + + tfFile: String + + +
      Definition Classes
      ReadE5DLModelReadTensorflowModel
      +
    33. + + + + + + + + + def + + + toString(): String + + +
      Definition Classes
      AnyRef → Any
      +
    34. + + + + + + + + final + def + + + wait(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    35. + + + + + + + + final + def + + + wait(arg0: Long, arg1: Int): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    36. + + + + + + + + final + def + + + wait(arg0: Long): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    37. +
    +
    + + + + +
    + +
    +
    +

    Inherited from Serializable

    +
    +

    Inherited from Serializable

    +
    +

    Inherited from ReadE5DLModel

    +
    +

    Inherited from ReadTensorflowModel

    +
    +

    Inherited from ReadablePretrainedE5Model

    +
    +

    Inherited from HasPretrained[E5Embeddings]

    +
    +

    Inherited from DefaultParamsReadable[E5Embeddings]

    +
    +

    Inherited from MLReadable[E5Embeddings]

    +
    +

    Inherited from AnyRef

    +
    +

    Inherited from Any

    +
    + +
    + +
    +
    +

    Ungrouped

    + +
    +
    + +
    + +
    + + + +
    +
    +
    + + diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/E5Embeddings.html b/docs/api/com/johnsnowlabs/nlp/embeddings/E5Embeddings.html new file mode 100644 index 00000000000000..100579ebf5f2b5 --- /dev/null +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/E5Embeddings.html @@ -0,0 +1,3228 @@ + + + + + + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.E5Embeddings + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    Packages

    + +
    +
    +
    + + + +

    + + + class + + + E5Embeddings extends AnnotatorModel[E5Embeddings] with HasBatchedAnnotate[E5Embeddings] with WriteTensorflowModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties with HasEngine + +

    + + +

    Sentence embeddings using E5.

    E5, an instruction-finetuned text embedding model that can generate text embeddings tailored +to any task (e.g., classification, retrieval, clustering, text evaluation, etc.)

    Pretrained models can be loaded with pretrained of the companion object:

    val embeddings = E5Embeddings.pretrained()
    +  .setInputCols("document")
    +  .setOutputCol("e5_embeddings")

    The default model is "e5_small", if no name is provided.

    For available pretrained models please see the +Models Hub.

    For extended examples of usage, see +E5EmbeddingsTestSpec.

    Sources :

    Text Embeddings by Weakly-Supervised Contrastive Pre-training

    E5 Github Repository

    Paper abstract

    This paper presents E5, a family of state-of-the-art text embeddings that transfer well to a +wide range of tasks. The model is trained in a contrastive manner with weak supervision +signals from our curated large-scale text pair dataset (called CCPairs). E5 can be readily +used as a general-purpose embedding model for any tasks requiring a single-vector +representation of texts such as retrieval, clustering, and classification, achieving strong +performance in both zero-shot and fine-tuned settings. We conduct extensive evaluations on 56 +datasets from the BEIR and MTEB benchmarks. For zero-shot settings, E5 is the first model that +outperforms the strong BM25 baseline on the BEIR retrieval benchmark without using any labeled +data. When fine-tuned, E5 obtains the best results on the MTEB benchmark, beating existing +embedding models with 40× more parameters.

    Example

    import spark.implicits._
    +import com.johnsnowlabs.nlp.base.DocumentAssembler
    +import com.johnsnowlabs.nlp.annotators.Tokenizer
    +import com.johnsnowlabs.nlp.embeddings.E5Embeddings
    +import com.johnsnowlabs.nlp.EmbeddingsFinisher
    +import org.apache.spark.ml.Pipeline
    +
    +val documentAssembler = new DocumentAssembler()
    +  .setInputCol("text")
    +  .setOutputCol("document")
    +
    +val embeddings = E5Embeddings.pretrained("e5_small", "en")
    +  .setInputCols("document")
    +  .setOutputCol("e5_embeddings")
    +
    +val embeddingsFinisher = new EmbeddingsFinisher()
    +  .setInputCols("e5_embeddings")
    +  .setOutputCols("finished_embeddings")
    +  .setOutputAsVector(true)
    +
    +val pipeline = new Pipeline().setStages(Array(
    +  documentAssembler,
    +  embeddings,
    +  embeddingsFinisher
    +))
    +
    +val data = Seq("query: how much protein should a female eat",
    +"passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day." +
    +But, as you can see from this chart, you'll need to increase that if you're expecting or training for a" +
    +marathon. Check out the chart below to see how much protein you should be eating each day."
    +
    +).toDF("text")
    +val result = pipeline.fit(data).transform(data)
    +
    +result.selectExpr("explode(finished_embeddings) as result").show(1, 80)
    ++--------------------------------------------------------------------------------+
    +|                                                                          result|
    ++--------------------------------------------------------------------------------+
    +|[[8.0190285E-4, -0.005974853, -0.072875895, 0.007944068, 0.026059335, -0.0080...|
    +[[0.050514214, 0.010061974, -0.04340176, -0.020937217, 0.05170225, 0.01157857...|
    ++--------------------------------------------------------------------------------+
    See also

    + Annotators Main Page for a list of transformer + based embeddings

    + + Linear Supertypes + + +
    + + +
    +
    +
    + + + + + +
    +
    +
    + Ordering +
      +
    1. Grouped
    2. +
    3. Alphabetic
    4. +
    5. By Inheritance
    6. +
    +
    +
    + Inherited
    +
    +
      +
    1. E5Embeddings
    2. HasEngine
    3. HasCaseSensitiveProperties
    4. HasStorageRef
    5. HasEmbeddingsProperties
    6. HasProtectedParams
    7. WriteTensorflowModel
    8. HasBatchedAnnotate
    9. AnnotatorModel
    10. CanBeLazy
    11. RawAnnotator
    12. HasOutputAnnotationCol
    13. HasInputAnnotationCols
    14. HasOutputAnnotatorType
    15. ParamsAndFeaturesWritable
    16. HasFeatures
    17. DefaultParamsWritable
    18. MLWritable
    19. Model
    20. Transformer
    21. PipelineStage
    22. Logging
    23. Params
    24. Serializable
    25. Serializable
    26. Identifiable
    27. AnyRef
    28. Any
    29. +
    +
    + +
      +
    1. Hide All
    2. +
    3. Show All
    4. +
    +
    +
    + Visibility +
    1. Public
    2. All
    +
    +
    +
    + +
    +
    +
    +

    Instance Constructors

    +
    1. + + + + + + + + + new + + + E5Embeddings() + + + +
    2. + + + + + + + + + new + + + E5Embeddings(uid: String) + + +

      uid

      + required uid for storing annotator to disk

      +
    +
    + +
    +

    Type Members

    +
    1. + + + + + + + + implicit + class + + + ProtectedParam[T] extends Param[T] + + +
      Definition Classes
      HasProtectedParams
      +
    2. + + + + + + + + + type + + + AnnotationContent = Seq[Row] + + +

      internal types to show Rows as a relevant StructType Should be deleted once Spark releases +UserDefinedTypes to @developerAPI +

      internal types to show Rows as a relevant StructType Should be deleted once Spark releases +UserDefinedTypes to @developerAPI +

      Attributes
      protected
      Definition Classes
      AnnotatorModel
      +
    3. + + + + + + + + + type + + + AnnotatorType = String + + +
      Definition Classes
      HasOutputAnnotatorType
      +
    +
    + + + +
    +

    Value Members

    +
      +
    1. + + + + + + + + final + def + + + !=(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    2. + + + + + + + + final + def + + + ##(): Int + + +
      Definition Classes
      AnyRef → Any
      +
    3. + + + + + + + + final + def + + + $[T](param: Param[T]): T + + +
      Attributes
      protected
      Definition Classes
      Params
      +
    4. + + + + + + + + + def + + + $$[T](feature: StructFeature[T]): T + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    5. + + + + + + + + + def + + + $$[K, V](feature: MapFeature[K, V]): Map[K, V] + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    6. + + + + + + + + + def + + + $$[T](feature: SetFeature[T]): Set[T] + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    7. + + + + + + + + + def + + + $$[T](feature: ArrayFeature[T]): Array[T] + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    8. + + + + + + + + final + def + + + ==(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    9. + + + + + + + + + def + + + _transform(dataset: Dataset[_], recursivePipeline: Option[PipelineModel]): DataFrame + + +
      Attributes
      protected
      Definition Classes
      AnnotatorModel
      +
    10. + + + + + + + + + def + + + afterAnnotate(dataset: DataFrame): DataFrame + + +
      Attributes
      protected
      Definition Classes
      E5EmbeddingsAnnotatorModel
      +
    11. + + + + + + + + final + def + + + asInstanceOf[T0]: T0 + + +
      Definition Classes
      Any
      +
    12. + + + + + + + + + def + + + batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] + + +

      takes a document and annotations and produces new annotations of this annotator's annotation +type +

      takes a document and annotations and produces new annotations of this annotator's annotation +type +

      batchedAnnotations

      + Annotations that correspond to inputAnnotationCols generated by previous annotators if any

      returns

      + any number of annotations processed for every input annotation. Not necessary one to one + relationship

      Definition Classes
      E5EmbeddingsHasBatchedAnnotate
      +
    13. + + + + + + + + + def + + + batchProcess(rows: Iterator[_]): Iterator[Row] + + +
      Definition Classes
      HasBatchedAnnotate
      +
    14. + + + + + + + + + val + + + batchSize: IntParam + + +

      Size of every batch (Default depends on model).

      Size of every batch (Default depends on model). +

      Definition Classes
      HasBatchedAnnotate
      +
    15. + + + + + + + + + def + + + beforeAnnotate(dataset: Dataset[_]): Dataset[_] + + +
      Attributes
      protected
      Definition Classes
      AnnotatorModel
      +
    16. + + + + + + + + + val + + + caseSensitive: BooleanParam + + +

      Whether to ignore case in index lookups (Default depends on model) +

      Whether to ignore case in index lookups (Default depends on model) +

      Definition Classes
      HasCaseSensitiveProperties
      +
    17. + + + + + + + + final + def + + + checkSchema(schema: StructType, inputAnnotatorType: String): Boolean + + +
      Attributes
      protected
      Definition Classes
      HasInputAnnotationCols
      +
    18. + + + + + + + + final + def + + + clear(param: Param[_]): E5Embeddings.this.type + + +
      Definition Classes
      Params
      +
    19. + + + + + + + + + def + + + clone(): AnyRef + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    20. + + + + + + + + + val + + + configProtoBytes: IntArrayParam + + +

      ConfigProto from tensorflow, serialized into byte array.

      ConfigProto from tensorflow, serialized into byte array. Get with +config_proto.SerializeToString() +

      +
    21. + + + + + + + + + def + + + copy(extra: ParamMap): E5Embeddings + + +

      requirement for annotators copies

      requirement for annotators copies

      Definition Classes
      RawAnnotator → Model → Transformer → PipelineStage → Params
      +
    22. + + + + + + + + + def + + + copyValues[T <: Params](to: T, extra: ParamMap): T + + +
      Attributes
      protected
      Definition Classes
      Params
      +
    23. + + + + + + + + + def + + + createDatabaseConnection(database: Name): RocksDBConnection + + +
      Definition Classes
      HasStorageRef
      +
    24. + + + + + + + + final + def + + + defaultCopy[T <: Params](extra: ParamMap): T + + +
      Attributes
      protected
      Definition Classes
      Params
      +
    25. + + + + + + + + + val + + + dimension: ProtectedParam[Int] + + +

      Number of embedding dimensions (Default depends on model) +

      Number of embedding dimensions (Default depends on model) +

      Definition Classes
      HasEmbeddingsProperties
      +
    26. + + + + + + + + + val + + + engine: Param[String] + + +

      This param is set internally once via loadSavedModel.

      This param is set internally once via loadSavedModel. That's why there is no setter +

      Definition Classes
      HasEngine
      +
    27. + + + + + + + + final + def + + + eq(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    28. + + + + + + + + + def + + + equals(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    29. + + + + + + + + + def + + + explainParam(param: Param[_]): String + + +
      Definition Classes
      Params
      +
    30. + + + + + + + + + def + + + explainParams(): String + + +
      Definition Classes
      Params
      +
    31. + + + + + + + + + def + + + extraValidate(structType: StructType): Boolean + + +
      Attributes
      protected
      Definition Classes
      RawAnnotator
      +
    32. + + + + + + + + + def + + + extraValidateMsg: String + + +

      Override for additional custom schema checks

      Override for additional custom schema checks

      Attributes
      protected
      Definition Classes
      RawAnnotator
      +
    33. + + + + + + + + final + def + + + extractParamMap(): ParamMap + + +
      Definition Classes
      Params
      +
    34. + + + + + + + + final + def + + + extractParamMap(extra: ParamMap): ParamMap + + +
      Definition Classes
      Params
      +
    35. + + + + + + + + + val + + + features: ArrayBuffer[Feature[_, _, _]] + + +
      Definition Classes
      HasFeatures
      +
    36. + + + + + + + + + def + + + finalize(): Unit + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + classOf[java.lang.Throwable] + ) + +
      +
    37. + + + + + + + + + def + + + get[T](feature: StructFeature[T]): Option[T] + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    38. + + + + + + + + + def + + + get[K, V](feature: MapFeature[K, V]): Option[Map[K, V]] + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    39. + + + + + + + + + def + + + get[T](feature: SetFeature[T]): Option[Set[T]] + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    40. + + + + + + + + + def + + + get[T](feature: ArrayFeature[T]): Option[Array[T]] + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    41. + + + + + + + + final + def + + + get[T](param: Param[T]): Option[T] + + +
      Definition Classes
      Params
      +
    42. + + + + + + + + + def + + + getBatchSize: Int + + +

      Size of every batch.

      Size of every batch. +

      Definition Classes
      HasBatchedAnnotate
      +
    43. + + + + + + + + + def + + + getCaseSensitive: Boolean + + +

      Definition Classes
      HasCaseSensitiveProperties
      +
    44. + + + + + + + + final + def + + + getClass(): Class[_] + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    45. + + + + + + + + + def + + + getConfigProtoBytes: Option[Array[Byte]] + + +

      +
    46. + + + + + + + + final + def + + + getDefault[T](param: Param[T]): Option[T] + + +
      Definition Classes
      Params
      +
    47. + + + + + + + + + def + + + getDimension: Int + + +

      Definition Classes
      HasEmbeddingsProperties
      +
    48. + + + + + + + + + def + + + getEngine: String + + +

      Definition Classes
      HasEngine
      +
    49. + + + + + + + + + def + + + getInputCols: Array[String] + + +

      returns

      input annotations columns currently used

      Definition Classes
      HasInputAnnotationCols
      +
    50. + + + + + + + + + def + + + getLazyAnnotator: Boolean + + +
      Definition Classes
      CanBeLazy
      +
    51. + + + + + + + + + def + + + getMaxSentenceLength: Int + + +

      +
    52. + + + + + + + + + def + + + getModelIfNotSet: E5 + + +

      +
    53. + + + + + + + + final + def + + + getOrDefault[T](param: Param[T]): T + + +
      Definition Classes
      Params
      +
    54. + + + + + + + + final + def + + + getOutputCol: String + + +

      Gets annotation column name going to generate

      Gets annotation column name going to generate

      Definition Classes
      HasOutputAnnotationCol
      +
    55. + + + + + + + + + def + + + getParam(paramName: String): Param[Any] + + +
      Definition Classes
      Params
      +
    56. + + + + + + + + + def + + + getSignatures: Option[Map[String, String]] + + +

      +
    57. + + + + + + + + + def + + + getStorageRef: String + + +
      Definition Classes
      HasStorageRef
      +
    58. + + + + + + + + final + def + + + hasDefault[T](param: Param[T]): Boolean + + +
      Definition Classes
      Params
      +
    59. + + + + + + + + + def + + + hasParam(paramName: String): Boolean + + +
      Definition Classes
      Params
      +
    60. + + + + + + + + + def + + + hasParent: Boolean + + +
      Definition Classes
      Model
      +
    61. + + + + + + + + + def + + + hashCode(): Int + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    62. + + + + + + + + + def + + + initializeLogIfNecessary(isInterpreter: Boolean, silent: Boolean): Boolean + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    63. + + + + + + + + + def + + + initializeLogIfNecessary(isInterpreter: Boolean): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    64. + + + + + + + + + val + + + inputAnnotatorTypes: Array[String] + + +

      Annotator reference id.

      Annotator reference id. Used to identify elements in metadata or to refer to this annotator +type +

      Definition Classes
      E5EmbeddingsHasInputAnnotationCols
      +
    65. + + + + + + + + final + val + + + inputCols: StringArrayParam + + +

      columns that contain annotations necessary to run this annotator AnnotatorType is used both +as input and output columns if not specified +

      columns that contain annotations necessary to run this annotator AnnotatorType is used both +as input and output columns if not specified +

      Attributes
      protected
      Definition Classes
      HasInputAnnotationCols
      +
    66. + + + + + + + + final + def + + + isDefined(param: Param[_]): Boolean + + +
      Definition Classes
      Params
      +
    67. + + + + + + + + final + def + + + isInstanceOf[T0]: Boolean + + +
      Definition Classes
      Any
      +
    68. + + + + + + + + final + def + + + isSet(param: Param[_]): Boolean + + +
      Definition Classes
      Params
      +
    69. + + + + + + + + + def + + + isTraceEnabled(): Boolean + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    70. + + + + + + + + + val + + + lazyAnnotator: BooleanParam + + +
      Definition Classes
      CanBeLazy
      +
    71. + + + + + + + + + def + + + log: Logger + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    72. + + + + + + + + + def + + + logDebug(msg: ⇒ String, throwable: Throwable): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    73. + + + + + + + + + def + + + logDebug(msg: ⇒ String): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    74. + + + + + + + + + def + + + logError(msg: ⇒ String, throwable: Throwable): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    75. + + + + + + + + + def + + + logError(msg: ⇒ String): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    76. + + + + + + + + + def + + + logInfo(msg: ⇒ String, throwable: Throwable): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    77. + + + + + + + + + def + + + logInfo(msg: ⇒ String): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    78. + + + + + + + + + def + + + logName: String + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    79. + + + + + + + + + def + + + logTrace(msg: ⇒ String, throwable: Throwable): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    80. + + + + + + + + + def + + + logTrace(msg: ⇒ String): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    81. + + + + + + + + + def + + + logWarning(msg: ⇒ String, throwable: Throwable): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    82. + + + + + + + + + def + + + logWarning(msg: ⇒ String): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    83. + + + + + + + + + val + + + maxSentenceLength: IntParam + + +

      Max sentence length to process (Default: 128) +

      +
    84. + + + + + + + + + def + + + msgHelper(schema: StructType): String + + +
      Attributes
      protected
      Definition Classes
      HasInputAnnotationCols
      +
    85. + + + + + + + + final + def + + + ne(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    86. + + + + + + + + final + def + + + notify(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    87. + + + + + + + + final + def + + + notifyAll(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    88. + + + + + + + + + def + + + onWrite(path: String, spark: SparkSession): Unit + + + +
    89. + + + + + + + + + val + + + optionalInputAnnotatorTypes: Array[String] + + +
      Definition Classes
      HasInputAnnotationCols
      +
    90. + + + + + + + + + val + + + outputAnnotatorType: AnnotatorType + + +
      Definition Classes
      E5EmbeddingsHasOutputAnnotatorType
      +
    91. + + + + + + + + final + val + + + outputCol: Param[String] + + +
      Attributes
      protected
      Definition Classes
      HasOutputAnnotationCol
      +
    92. + + + + + + + + + lazy val + + + params: Array[Param[_]] + + +
      Definition Classes
      Params
      +
    93. + + + + + + + + + var + + + parent: Estimator[E5Embeddings] + + +
      Definition Classes
      Model
      +
    94. + + + + + + + + + def + + + save(path: String): Unit + + +
      Definition Classes
      MLWritable
      Annotations
      + @Since( + + "1.6.0" + ) + + @throws( + + ... + ) + +
      +
    95. + + + + + + + + + def + + + sentenceEndTokenId: Int + + +

      +
    96. + + + + + + + + + def + + + sentenceStartTokenId: Int + + + +
    97. + + + + + + + + + def + + + set[T](param: ProtectedParam[T], value: T): E5Embeddings.this.type + + +

      Sets the value for a protected Param.

      Sets the value for a protected Param.

      If the parameter was already set, it will not be set again. Default values do not count as a +set value and can be overridden. +

      T

      + Type of the parameter

      param

      + Protected parameter to set

      value

      + Value for the parameter

      returns

      + This object

      Definition Classes
      HasProtectedParams
      +
    98. + + + + + + + + + def + + + set[T](feature: StructFeature[T], value: T): E5Embeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    99. + + + + + + + + + def + + + set[K, V](feature: MapFeature[K, V], value: Map[K, V]): E5Embeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    100. + + + + + + + + + def + + + set[T](feature: SetFeature[T], value: Set[T]): E5Embeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    101. + + + + + + + + + def + + + set[T](feature: ArrayFeature[T], value: Array[T]): E5Embeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    102. + + + + + + + + final + def + + + set(paramPair: ParamPair[_]): E5Embeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      Params
      +
    103. + + + + + + + + final + def + + + set(param: String, value: Any): E5Embeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      Params
      +
    104. + + + + + + + + final + def + + + set[T](param: Param[T], value: T): E5Embeddings.this.type + + +
      Definition Classes
      Params
      +
    105. + + + + + + + + + def + + + setBatchSize(size: Int): E5Embeddings.this.type + + +

      Size of every batch.

      Size of every batch. +

      Definition Classes
      HasBatchedAnnotate
      +
    106. + + + + + + + + + def + + + setCaseSensitive(value: Boolean): E5Embeddings.this.type + + +

      Whether to lowercase tokens or not +

      Whether to lowercase tokens or not +

      Definition Classes
      E5EmbeddingsHasCaseSensitiveProperties
      +
    107. + + + + + + + + + def + + + setConfigProtoBytes(bytes: Array[Int]): E5Embeddings.this.type + + +

      +
    108. + + + + + + + + + def + + + setDefault[T](feature: StructFeature[T], value: () ⇒ T): E5Embeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    109. + + + + + + + + + def + + + setDefault[K, V](feature: MapFeature[K, V], value: () ⇒ Map[K, V]): E5Embeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    110. + + + + + + + + + def + + + setDefault[T](feature: SetFeature[T], value: () ⇒ Set[T]): E5Embeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    111. + + + + + + + + + def + + + setDefault[T](feature: ArrayFeature[T], value: () ⇒ Array[T]): E5Embeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    112. + + + + + + + + final + def + + + setDefault(paramPairs: ParamPair[_]*): E5Embeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      Params
      +
    113. + + + + + + + + final + def + + + setDefault[T](param: Param[T], value: T): E5Embeddings.this.type + + +
      Attributes
      protected[org.apache.spark.ml]
      Definition Classes
      Params
      +
    114. + + + + + + + + + def + + + setDimension(value: Int): E5Embeddings.this.type + + +

      Set Embeddings dimensions for the BERT model Only possible to set this when the first time +is saved dimension is not changeable, it comes from BERT config file +

      Set Embeddings dimensions for the BERT model Only possible to set this when the first time +is saved dimension is not changeable, it comes from BERT config file +

      Definition Classes
      E5EmbeddingsHasEmbeddingsProperties
      +
    115. + + + + + + + + final + def + + + setInputCols(value: String*): E5Embeddings.this.type + + +
      Definition Classes
      HasInputAnnotationCols
      +
    116. + + + + + + + + + def + + + setInputCols(value: Array[String]): E5Embeddings.this.type + + +

      Overrides required annotators column if different than default

      Overrides required annotators column if different than default

      Definition Classes
      HasInputAnnotationCols
      +
    117. + + + + + + + + + def + + + setLazyAnnotator(value: Boolean): E5Embeddings.this.type + + +
      Definition Classes
      CanBeLazy
      +
    118. + + + + + + + + + def + + + setMaxSentenceLength(value: Int): E5Embeddings.this.type + + +

      +
    119. + + + + + + + + + def + + + setModelIfNotSet(spark: SparkSession, tensorflowWrapper: TensorflowWrapper): E5Embeddings + + +

      +
    120. + + + + + + + + final + def + + + setOutputCol(value: String): E5Embeddings.this.type + + +

      Overrides annotation column name when transforming

      Overrides annotation column name when transforming

      Definition Classes
      HasOutputAnnotationCol
      +
    121. + + + + + + + + + def + + + setParent(parent: Estimator[E5Embeddings]): E5Embeddings + + +
      Definition Classes
      Model
      +
    122. + + + + + + + + + def + + + setSignatures(value: Map[String, String]): E5Embeddings.this.type + + +

      +
    123. + + + + + + + + + def + + + setStorageRef(value: String): E5Embeddings.this.type + + +
      Definition Classes
      HasStorageRef
      +
    124. + + + + + + + + + def + + + setVocabulary(value: Map[String, Int]): E5Embeddings.this.type + + +

      +
    125. + + + + + + + + + val + + + signatures: MapFeature[String, String] + + +

      It contains TF model signatures for the laded saved model +

      +
    126. + + + + + + + + + val + + + storageRef: Param[String] + + +

      Unique identifier for storage (Default: this.uid) +

      Unique identifier for storage (Default: this.uid) +

      Definition Classes
      HasStorageRef
      +
    127. + + + + + + + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + + +
      Definition Classes
      AnyRef
      +
    128. + + + + + + + + + def + + + toString(): String + + +
      Definition Classes
      Identifiable → AnyRef → Any
      +
    129. + + + + + + + + + def + + + tokenize(sentences: Seq[Annotation]): Seq[WordpieceTokenizedSentence] + + + +
    130. + + + + + + + + final + def + + + transform(dataset: Dataset[_]): DataFrame + + +

      Given requirements are met, this applies ML transformation within a Pipeline or stand-alone +Output annotation will be generated as a new column, previous annotations are still +available separately metadata is built at schema level to record annotations structural +information outside its content +

      Given requirements are met, this applies ML transformation within a Pipeline or stand-alone +Output annotation will be generated as a new column, previous annotations are still +available separately metadata is built at schema level to record annotations structural +information outside its content +

      dataset

      + Dataset[Row]

      Definition Classes
      AnnotatorModel → Transformer
      +
    131. + + + + + + + + + def + + + transform(dataset: Dataset[_], paramMap: ParamMap): DataFrame + + +
      Definition Classes
      Transformer
      Annotations
      + @Since( + + "2.0.0" + ) + +
      +
    132. + + + + + + + + + def + + + transform(dataset: Dataset[_], firstParamPair: ParamPair[_], otherParamPairs: ParamPair[_]*): DataFrame + + +
      Definition Classes
      Transformer
      Annotations
      + @Since( + + "2.0.0" + ) + + @varargs() + +
      +
    133. + + + + + + + + final + def + + + transformSchema(schema: StructType): StructType + + +

      requirement for pipeline transformation validation.

      requirement for pipeline transformation validation. It is called on fit()

      Definition Classes
      RawAnnotator → PipelineStage
      +
    134. + + + + + + + + + def + + + transformSchema(schema: StructType, logging: Boolean): StructType + + +
      Attributes
      protected
      Definition Classes
      PipelineStage
      Annotations
      + @DeveloperApi() + +
      +
    135. + + + + + + + + + val + + + uid: String + + +
      Definition Classes
      E5Embeddings → Identifiable
      +
    136. + + + + + + + + + def + + + validate(schema: StructType): Boolean + + +

      takes a Dataset and checks to see if all the required annotation types are present.

      takes a Dataset and checks to see if all the required annotation types are present. +

      schema

      + to be validated

      returns

      + True if all the required types are present, else false

      Attributes
      protected
      Definition Classes
      RawAnnotator
      +
    137. + + + + + + + + + def + + + validateStorageRef(dataset: Dataset[_], inputCols: Array[String], annotatorType: String): Unit + + +
      Definition Classes
      HasStorageRef
      +
    138. + + + + + + + + + val + + + vocabulary: MapFeature[String, Int] + + +

      Vocabulary used to encode the words to ids with WordPieceEncoder +

      +
    139. + + + + + + + + final + def + + + wait(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    140. + + + + + + + + final + def + + + wait(arg0: Long, arg1: Int): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    141. + + + + + + + + final + def + + + wait(arg0: Long): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    142. + + + + + + + + + def + + + wrapColumnMetadata(col: Column): Column + + +
      Attributes
      protected
      Definition Classes
      RawAnnotator
      +
    143. + + + + + + + + + def + + + wrapEmbeddingsMetadata(col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column + + +
      Attributes
      protected
      Definition Classes
      HasEmbeddingsProperties
      +
    144. + + + + + + + + + def + + + wrapSentenceEmbeddingsMetadata(col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column + + +
      Attributes
      protected
      Definition Classes
      HasEmbeddingsProperties
      +
    145. + + + + + + + + + def + + + write: MLWriter + + +
      Definition Classes
      ParamsAndFeaturesWritable → DefaultParamsWritable → MLWritable
      +
    146. + + + + + + + + + def + + + writeTensorflowHub(path: String, tfPath: String, spark: SparkSession, suffix: String = "_use"): Unit + + +
      Definition Classes
      WriteTensorflowModel
      +
    147. + + + + + + + + + def + + + writeTensorflowModel(path: String, spark: SparkSession, tensorflow: TensorflowWrapper, suffix: String, filename: String, configProtoBytes: Option[Array[Byte]] = None): Unit + + +
      Definition Classes
      WriteTensorflowModel
      +
    148. + + + + + + + + + def + + + writeTensorflowModelV2(path: String, spark: SparkSession, tensorflow: TensorflowWrapper, suffix: String, filename: String, configProtoBytes: Option[Array[Byte]] = None, savedSignatures: Option[Map[String, String]] = None): Unit + + +
      Definition Classes
      WriteTensorflowModel
      +
    149. +
    +
    + + + + +
    + +
    +
    +

    Inherited from HasEngine

    +
    +

    Inherited from HasCaseSensitiveProperties

    +
    +

    Inherited from HasStorageRef

    +
    +

    Inherited from HasEmbeddingsProperties

    +
    +

    Inherited from HasProtectedParams

    +
    +

    Inherited from WriteTensorflowModel

    +
    +

    Inherited from HasBatchedAnnotate[E5Embeddings]

    +
    +

    Inherited from AnnotatorModel[E5Embeddings]

    +
    +

    Inherited from CanBeLazy

    +
    +

    Inherited from RawAnnotator[E5Embeddings]

    +
    +

    Inherited from HasOutputAnnotationCol

    +
    +

    Inherited from HasInputAnnotationCols

    +
    +

    Inherited from HasOutputAnnotatorType

    +
    +

    Inherited from ParamsAndFeaturesWritable

    +
    +

    Inherited from HasFeatures

    +
    +

    Inherited from DefaultParamsWritable

    +
    +

    Inherited from MLWritable

    +
    +

    Inherited from Model[E5Embeddings]

    +
    +

    Inherited from Transformer

    +
    +

    Inherited from PipelineStage

    +
    +

    Inherited from Logging

    +
    +

    Inherited from Params

    +
    +

    Inherited from Serializable

    +
    +

    Inherited from Serializable

    +
    +

    Inherited from Identifiable

    +
    +

    Inherited from AnyRef

    +
    +

    Inherited from Any

    +
    + +
    + +
    +
    +

    Parameters

    +

    + A list of (hyper-)parameter keys this annotator can take. Users can set and get the + parameter values through setters and getters, respectively.

    +
    +

    Members

    + +
    +

    Parameter setters

    + +
    +

    Parameter getters

    + +
    +
    + +
    + +
    + + + +
    +
    +
    + + diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ElmoEmbeddings$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ElmoEmbeddings$.html index fbd06e534086cc..b33aea25d2a848 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ElmoEmbeddings$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ElmoEmbeddings$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ElmoEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ElmoEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ElmoEmbeddings.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ElmoEmbeddings.html index 7495a578af12f2..17c01689e39253 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ElmoEmbeddings.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ElmoEmbeddings.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ElmoEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ElmoEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/EmbeddingsCoverage$CoverageResult.html b/docs/api/com/johnsnowlabs/nlp/embeddings/EmbeddingsCoverage$CoverageResult.html index d7c4b8fa46140e..b90549a0a4aeb4 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/EmbeddingsCoverage$CoverageResult.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/EmbeddingsCoverage$CoverageResult.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.EmbeddingsCoverage.CoverageResult - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.EmbeddingsCoverage.CoverageResult + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/HasEmbeddingsProperties.html b/docs/api/com/johnsnowlabs/nlp/embeddings/HasEmbeddingsProperties.html index cd5b8051c63d0e..d51f1f33ac1c92 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/HasEmbeddingsProperties.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/HasEmbeddingsProperties.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.HasEmbeddingsProperties - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.HasEmbeddingsProperties + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • @@ -414,7 +438,7 @@

    Known Subclasses - + diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/InstructorEmbeddings$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/InstructorEmbeddings$.html new file mode 100644 index 00000000000000..56f0df02107711 --- /dev/null +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/InstructorEmbeddings$.html @@ -0,0 +1,1208 @@ + + + + + + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.InstructorEmbeddings + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    Packages

    + +
    +
    +
    + + + +

    + + + object + + + InstructorEmbeddings extends ReadablePretrainedInstructorModel with ReadInstructorDLModel with ReadSentencePieceModel with Serializable + +

    + + +

    This is the companion object of InstructorEmbeddings. Please refer to that class for the +documentation. +

    + + +
    +
    +
    + + + + + +
    +
    +
    + Ordering +
      + +
    1. Alphabetic
    2. +
    3. By Inheritance
    4. +
    +
    +
    + Inherited
    +
    +
      +
    1. InstructorEmbeddings
    2. Serializable
    3. Serializable
    4. ReadInstructorDLModel
    5. ReadSentencePieceModel
    6. ReadTensorflowModel
    7. ReadablePretrainedInstructorModel
    8. HasPretrained
    9. ParamsAndFeaturesReadable
    10. DefaultParamsReadable
    11. MLReadable
    12. AnyRef
    13. Any
    14. +
    +
    + +
      +
    1. Hide All
    2. +
    3. Show All
    4. +
    +
    +
    + Visibility +
    1. Public
    2. All
    +
    +
    +
    + +
    +
    + + + + + + +
    +

    Value Members

    +
      +
    1. + + + + + + + + final + def + + + !=(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    2. + + + + + + + + final + def + + + ##(): Int + + +
      Definition Classes
      AnyRef → Any
      +
    3. + + + + + + + + final + def + + + ==(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    4. + + + + + + + + + def + + + addReader(reader: (InstructorEmbeddings, String, SparkSession) ⇒ Unit): Unit + + +
      Definition Classes
      ParamsAndFeaturesReadable
      +
    5. + + + + + + + + final + def + + + asInstanceOf[T0]: T0 + + +
      Definition Classes
      Any
      +
    6. + + + + + + + + + def + + + clone(): AnyRef + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    7. + + + + + + + + + val + + + defaultLang: String + + +
      Definition Classes
      HasPretrained
      +
    8. + + + + + + + + + lazy val + + + defaultLoc: String + + +
      Definition Classes
      HasPretrained
      +
    9. + + + + + + + + + val + + + defaultModelName: Some[String] + + + +
    10. + + + + + + + + final + def + + + eq(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    11. + + + + + + + + + def + + + equals(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    12. + + + + + + + + + def + + + finalize(): Unit + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + classOf[java.lang.Throwable] + ) + +
      +
    13. + + + + + + + + final + def + + + getClass(): Class[_] + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    14. + + + + + + + + + def + + + hashCode(): Int + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    15. + + + + + + + + final + def + + + isInstanceOf[T0]: Boolean + + +
      Definition Classes
      Any
      +
    16. + + + + + + + + + def + + + load(path: String): InstructorEmbeddings + + +
      Definition Classes
      MLReadable
      Annotations
      + @Since( + + "1.6.0" + ) + +
      +
    17. + + + + + + + + + def + + + loadSavedModel(modelPath: String, spark: SparkSession): InstructorEmbeddings + + +
      Definition Classes
      ReadInstructorDLModel
      +
    18. + + + + + + + + final + def + + + ne(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    19. + + + + + + + + final + def + + + notify(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    20. + + + + + + + + final + def + + + notifyAll(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    21. + + + + + + + + + def + + + pretrained(name: String, lang: String, remoteLoc: String): InstructorEmbeddings + + +

      Java default argument interoperability

      Java default argument interoperability

      Definition Classes
      ReadablePretrainedInstructorModelHasPretrained
      +
    22. + + + + + + + + + def + + + pretrained(name: String, lang: String): InstructorEmbeddings + + + +
    23. + + + + + + + + + def + + + pretrained(name: String): InstructorEmbeddings + + + +
    24. + + + + + + + + + def + + + pretrained(): InstructorEmbeddings + + +

      Java compliant-overrides

      Java compliant-overrides

      Definition Classes
      ReadablePretrainedInstructorModelHasPretrained
      +
    25. + + + + + + + + + def + + + read: MLReader[InstructorEmbeddings] + + +
      Definition Classes
      ParamsAndFeaturesReadable → DefaultParamsReadable → MLReadable
      +
    26. + + + + + + + + + def + + + readModel(instance: InstructorEmbeddings, path: String, spark: SparkSession): Unit + + +
      Definition Classes
      ReadInstructorDLModel
      +
    27. + + + + + + + + + def + + + readSentencePieceModel(path: String, spark: SparkSession, suffix: String, filename: String): SentencePieceWrapper + + +
      Definition Classes
      ReadSentencePieceModel
      +
    28. + + + + + + + + + def + + + readTensorflowChkPoints(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, tags: Array[String] = Array.empty, initAllTables: Boolean = false): TensorflowWrapper + + +
      Definition Classes
      ReadTensorflowModel
      +
    29. + + + + + + + + + def + + + readTensorflowHub(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, tags: Array[String] = Array.empty): TensorflowWrapper + + +
      Definition Classes
      ReadTensorflowModel
      +
    30. + + + + + + + + + def + + + readTensorflowModel(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, tags: Array[String] = Array.empty, initAllTables: Boolean = false, savedSignatures: Option[Map[String, String]] = None): TensorflowWrapper + + +
      Definition Classes
      ReadTensorflowModel
      +
    31. + + + + + + + + + def + + + readTensorflowWithSPModel(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, tags: Array[String] = Array.empty, initAllTables: Boolean = false, loadSP: Boolean = false): TensorflowWrapper + + +
      Definition Classes
      ReadTensorflowModel
      +
    32. + + + + + + + + + val + + + sppFile: String + + + +
    33. + + + + + + + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + + +
      Definition Classes
      AnyRef
      +
    34. + + + + + + + + + val + + + tfFile: String + + + +
    35. + + + + + + + + + def + + + toString(): String + + +
      Definition Classes
      AnyRef → Any
      +
    36. + + + + + + + + final + def + + + wait(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    37. + + + + + + + + final + def + + + wait(arg0: Long, arg1: Int): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    38. + + + + + + + + final + def + + + wait(arg0: Long): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    39. +
    +
    + + + + +
    + +
    +
    +

    Inherited from Serializable

    +
    +

    Inherited from Serializable

    +
    +

    Inherited from ReadInstructorDLModel

    +
    +

    Inherited from ReadSentencePieceModel

    +
    +

    Inherited from ReadTensorflowModel

    +
    +

    Inherited from HasPretrained[InstructorEmbeddings]

    +
    +

    Inherited from DefaultParamsReadable[InstructorEmbeddings]

    +
    +

    Inherited from MLReadable[InstructorEmbeddings]

    +
    +

    Inherited from AnyRef

    +
    +

    Inherited from Any

    +
    + +
    + +
    +
    +

    Ungrouped

    + +
    +
    + +
    + +
    + + + +
    +
    +
    + + diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/InstructorEmbeddings.html b/docs/api/com/johnsnowlabs/nlp/embeddings/InstructorEmbeddings.html new file mode 100644 index 00000000000000..fb9fab612e4224 --- /dev/null +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/InstructorEmbeddings.html @@ -0,0 +1,3199 @@ + + + + + + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.InstructorEmbeddings + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    Packages

    + +
    +
    +
    + + + +

    + + + class + + + InstructorEmbeddings extends AnnotatorModel[InstructorEmbeddings] with HasBatchedAnnotate[InstructorEmbeddings] with WriteTensorflowModel with HasEmbeddingsProperties with HasStorageRef with WriteSentencePieceModel with HasCaseSensitiveProperties with HasEngine + +

    + + +

    Sentence embeddings using INSTRUCTOR.

    Instructor👨‍🏫, an instruction-finetuned text embedding model that can generate text +embeddings tailored to any task (e.g., classification, retrieval, clustering, text evaluation, +etc.) and domains (e.g., science, finance, etc.) by simply providing the task instruction, +without any finetuning. Instructor👨‍ achieves sota on 70 diverse embedding tasks!

    Pretrained models can be loaded with pretrained of the companion object:

    val embeddings = InstructorEmbeddings.pretrained()
    +  .setInputCols("document")
    +  .setOutputCol("instructor_embeddings")

    The default model is "instructor_base", if no name is provided.

    For available pretrained models please see the +Models Hub.

    For extended examples of usage, see +InstructorEmbeddingsTestSpec.

    Sources :

    One Embedder, Any Task: Instruction-Finetuned Text Embeddings

    INSTRUCTOR Github Repository

    Paper abstract

    We introduce INSTRUCTOR, a new method for computing text embeddings given task instructions: +every text input is embedded together with instructions explaining the use case (e.g., task +and domain descriptions). Unlike encoders from prior work that are more specialized, +INSTRUCTOR is a single embedder that can generate text embeddings tailored to different +downstream tasks and domains, without any further training. We first annotate instructions for +330 diverse tasks and train INSTRUCTOR on this multitask mixture with a contrastive loss. We +evaluate INSTRUCTOR on 70 embedding evaluation tasks (66 of which are unseen during training), +ranging from classification and information retrieval to semantic textual similarity and text +generation evaluation. INSTRUCTOR, while having an order of magnitude fewer parameters than +the previous best model, achieves state-of-the-art performance, with an average improvement of +3.4% compared to the previous best results on the 70 diverse datasets. Our analysis suggests +that INSTRUCTOR is robust to changes in instructions, and that instruction finetuning +mitigates the challenge of training a single model on diverse datasets. Our model, code, and +data are available at this https URL. https://instructor-embedding.github.io/

    Example

    import spark.implicits._
    +import com.johnsnowlabs.nlp.base.DocumentAssembler
    +import com.johnsnowlabs.nlp.annotators.Tokenizer
    +import com.johnsnowlabs.nlp.embeddings.InstructorEmbeddings
    +import com.johnsnowlabs.nlp.EmbeddingsFinisher
    +import org.apache.spark.ml.Pipeline
    +
    +val documentAssembler = new DocumentAssembler()
    +  .setInputCol("text")
    +  .setOutputCol("document")
    +
    +val embeddings = InstructorEmbeddings.pretrained("instructor_base", "en")
    +  .setInputCols("document")
    +  .setInstruction("Represent the Medicine sentence for clustering: ")
    +  .setOutputCol("instructor_embeddings")
    +
    +val embeddingsFinisher = new EmbeddingsFinisher()
    +  .setInputCols("instructor_embeddings")
    +  .setOutputCols("finished_embeddings")
    +  .setOutputAsVector(true)
    +
    +val pipeline = new Pipeline().setStages(Array(
    +  documentAssembler,
    +  embeddings,
    +  embeddingsFinisher
    +))
    +
    +val data = Seq("Dynamical Scalar Degree of Freedom in Horava-Lifshitz Gravity").toDF("text")
    +val result = pipeline.fit(data).transform(data)
    +
    +result.selectExpr("explode(finished_embeddings) as result").show(1, 80)
    ++--------------------------------------------------------------------------------+
    +|                                                                          result|
    ++--------------------------------------------------------------------------------+
    +|[-2.3497989177703857,0.480538547039032,-0.3238905668258667,-1.612930893898010...|
    ++--------------------------------------------------------------------------------+
    See also

    + Annotators Main Page for a list of transformer + based embeddings

    + + +
    +
    +
    + + + + + +
    +
    +
    + Ordering +
      +
    1. Grouped
    2. +
    3. Alphabetic
    4. +
    5. By Inheritance
    6. +
    +
    +
    + Inherited
    +
    +
      +
    1. InstructorEmbeddings
    2. HasEngine
    3. HasCaseSensitiveProperties
    4. WriteSentencePieceModel
    5. HasStorageRef
    6. HasEmbeddingsProperties
    7. HasProtectedParams
    8. WriteTensorflowModel
    9. HasBatchedAnnotate
    10. AnnotatorModel
    11. CanBeLazy
    12. RawAnnotator
    13. HasOutputAnnotationCol
    14. HasInputAnnotationCols
    15. HasOutputAnnotatorType
    16. ParamsAndFeaturesWritable
    17. HasFeatures
    18. DefaultParamsWritable
    19. MLWritable
    20. Model
    21. Transformer
    22. PipelineStage
    23. Logging
    24. Params
    25. Serializable
    26. Serializable
    27. Identifiable
    28. AnyRef
    29. Any
    30. +
    +
    + +
      +
    1. Hide All
    2. +
    3. Show All
    4. +
    +
    +
    + Visibility +
    1. Public
    2. All
    +
    +
    +
    + +
    +
    +
    +

    Instance Constructors

    +
    1. + + + + + + + + + new + + + InstructorEmbeddings() + + + +
    2. + + + + + + + + + new + + + InstructorEmbeddings(uid: String) + + +

      uid

      + required uid for storing annotator to disk

      +
    +
    + +
    +

    Type Members

    +
    1. + + + + + + + + implicit + class + + + ProtectedParam[T] extends Param[T] + + +
      Definition Classes
      HasProtectedParams
      +
    2. + + + + + + + + + type + + + AnnotationContent = Seq[Row] + + +

      internal types to show Rows as a relevant StructType Should be deleted once Spark releases +UserDefinedTypes to @developerAPI +

      internal types to show Rows as a relevant StructType Should be deleted once Spark releases +UserDefinedTypes to @developerAPI +

      Attributes
      protected
      Definition Classes
      AnnotatorModel
      +
    3. + + + + + + + + + type + + + AnnotatorType = String + + +
      Definition Classes
      HasOutputAnnotatorType
      +
    +
    + + + +
    +

    Value Members

    +
      +
    1. + + + + + + + + final + def + + + !=(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    2. + + + + + + + + final + def + + + ##(): Int + + +
      Definition Classes
      AnyRef → Any
      +
    3. + + + + + + + + final + def + + + $[T](param: Param[T]): T + + +
      Attributes
      protected
      Definition Classes
      Params
      +
    4. + + + + + + + + + def + + + $$[T](feature: StructFeature[T]): T + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    5. + + + + + + + + + def + + + $$[K, V](feature: MapFeature[K, V]): Map[K, V] + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    6. + + + + + + + + + def + + + $$[T](feature: SetFeature[T]): Set[T] + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    7. + + + + + + + + + def + + + $$[T](feature: ArrayFeature[T]): Array[T] + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    8. + + + + + + + + final + def + + + ==(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    9. + + + + + + + + + def + + + _transform(dataset: Dataset[_], recursivePipeline: Option[PipelineModel]): DataFrame + + +
      Attributes
      protected
      Definition Classes
      AnnotatorModel
      +
    10. + + + + + + + + + def + + + afterAnnotate(dataset: DataFrame): DataFrame + + +
      Attributes
      protected
      Definition Classes
      InstructorEmbeddingsAnnotatorModel
      +
    11. + + + + + + + + final + def + + + asInstanceOf[T0]: T0 + + +
      Definition Classes
      Any
      +
    12. + + + + + + + + + def + + + batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] + + +

      takes a document and annotations and produces new annotations of this annotator's annotation +type +

      takes a document and annotations and produces new annotations of this annotator's annotation +type +

      batchedAnnotations

      + Annotations that correspond to inputAnnotationCols generated by previous annotators if any

      returns

      + any number of annotations processed for every input annotation. Not necessary one to one + relationship

      Definition Classes
      InstructorEmbeddingsHasBatchedAnnotate
      +
    13. + + + + + + + + + def + + + batchProcess(rows: Iterator[_]): Iterator[Row] + + +
      Definition Classes
      HasBatchedAnnotate
      +
    14. + + + + + + + + + val + + + batchSize: IntParam + + +

      Size of every batch (Default depends on model).

      Size of every batch (Default depends on model). +

      Definition Classes
      HasBatchedAnnotate
      +
    15. + + + + + + + + + def + + + beforeAnnotate(dataset: Dataset[_]): Dataset[_] + + +
      Attributes
      protected
      Definition Classes
      AnnotatorModel
      +
    16. + + + + + + + + + val + + + caseSensitive: BooleanParam + + +

      Whether to ignore case in index lookups (Default depends on model) +

      Whether to ignore case in index lookups (Default depends on model) +

      Definition Classes
      HasCaseSensitiveProperties
      +
    17. + + + + + + + + final + def + + + checkSchema(schema: StructType, inputAnnotatorType: String): Boolean + + +
      Attributes
      protected
      Definition Classes
      HasInputAnnotationCols
      +
    18. + + + + + + + + final + def + + + clear(param: Param[_]): InstructorEmbeddings.this.type + + +
      Definition Classes
      Params
      +
    19. + + + + + + + + + def + + + clone(): AnyRef + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    20. + + + + + + + + + val + + + configProtoBytes: IntArrayParam + + +

      ConfigProto from tensorflow, serialized into byte array.

      ConfigProto from tensorflow, serialized into byte array. Get with +config_proto.SerializeToString() +

      +
    21. + + + + + + + + + def + + + copy(extra: ParamMap): InstructorEmbeddings + + +

      requirement for annotators copies

      requirement for annotators copies

      Definition Classes
      RawAnnotator → Model → Transformer → PipelineStage → Params
      +
    22. + + + + + + + + + def + + + copyValues[T <: Params](to: T, extra: ParamMap): T + + +
      Attributes
      protected
      Definition Classes
      Params
      +
    23. + + + + + + + + + def + + + createDatabaseConnection(database: Name): RocksDBConnection + + +
      Definition Classes
      HasStorageRef
      +
    24. + + + + + + + + final + def + + + defaultCopy[T <: Params](extra: ParamMap): T + + +
      Attributes
      protected
      Definition Classes
      Params
      +
    25. + + + + + + + + + val + + + dimension: ProtectedParam[Int] + + +

      Number of embedding dimensions (Default depends on model) +

      Number of embedding dimensions (Default depends on model) +

      Definition Classes
      HasEmbeddingsProperties
      +
    26. + + + + + + + + + val + + + engine: Param[String] + + +

      This param is set internally once via loadSavedModel.

      This param is set internally once via loadSavedModel. That's why there is no setter +

      Definition Classes
      HasEngine
      +
    27. + + + + + + + + final + def + + + eq(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    28. + + + + + + + + + def + + + equals(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    29. + + + + + + + + + def + + + explainParam(param: Param[_]): String + + +
      Definition Classes
      Params
      +
    30. + + + + + + + + + def + + + explainParams(): String + + +
      Definition Classes
      Params
      +
    31. + + + + + + + + + def + + + extraValidate(structType: StructType): Boolean + + +
      Attributes
      protected
      Definition Classes
      RawAnnotator
      +
    32. + + + + + + + + + def + + + extraValidateMsg: String + + +

      Override for additional custom schema checks

      Override for additional custom schema checks

      Attributes
      protected
      Definition Classes
      RawAnnotator
      +
    33. + + + + + + + + final + def + + + extractParamMap(): ParamMap + + +
      Definition Classes
      Params
      +
    34. + + + + + + + + final + def + + + extractParamMap(extra: ParamMap): ParamMap + + +
      Definition Classes
      Params
      +
    35. + + + + + + + + + val + + + features: ArrayBuffer[Feature[_, _, _]] + + +
      Definition Classes
      HasFeatures
      +
    36. + + + + + + + + + def + + + finalize(): Unit + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + classOf[java.lang.Throwable] + ) + +
      +
    37. + + + + + + + + + def + + + get[T](feature: StructFeature[T]): Option[T] + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    38. + + + + + + + + + def + + + get[K, V](feature: MapFeature[K, V]): Option[Map[K, V]] + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    39. + + + + + + + + + def + + + get[T](feature: SetFeature[T]): Option[Set[T]] + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    40. + + + + + + + + + def + + + get[T](feature: ArrayFeature[T]): Option[Array[T]] + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    41. + + + + + + + + final + def + + + get[T](param: Param[T]): Option[T] + + +
      Definition Classes
      Params
      +
    42. + + + + + + + + + def + + + getBatchSize: Int + + +

      Size of every batch.

      Size of every batch. +

      Definition Classes
      HasBatchedAnnotate
      +
    43. + + + + + + + + + def + + + getCaseSensitive: Boolean + + +

      Definition Classes
      HasCaseSensitiveProperties
      +
    44. + + + + + + + + final + def + + + getClass(): Class[_] + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    45. + + + + + + + + + def + + + getConfigProtoBytes: Option[Array[Byte]] + + +

      +
    46. + + + + + + + + final + def + + + getDefault[T](param: Param[T]): Option[T] + + +
      Definition Classes
      Params
      +
    47. + + + + + + + + + def + + + getDimension: Int + + +

      Definition Classes
      HasEmbeddingsProperties
      +
    48. + + + + + + + + + def + + + getEngine: String + + +

      Definition Classes
      HasEngine
      +
    49. + + + + + + + + + def + + + getInputCols: Array[String] + + +

      returns

      input annotations columns currently used

      Definition Classes
      HasInputAnnotationCols
      +
    50. + + + + + + + + + def + + + getLazyAnnotator: Boolean + + +
      Definition Classes
      CanBeLazy
      +
    51. + + + + + + + + + def + + + getMaxSentenceLength: Int + + +

      +
    52. + + + + + + + + + def + + + getModelIfNotSet: Instructor + + +

      +
    53. + + + + + + + + final + def + + + getOrDefault[T](param: Param[T]): T + + +
      Definition Classes
      Params
      +
    54. + + + + + + + + final + def + + + getOutputCol: String + + +

      Gets annotation column name going to generate

      Gets annotation column name going to generate

      Definition Classes
      HasOutputAnnotationCol
      +
    55. + + + + + + + + + def + + + getParam(paramName: String): Param[Any] + + +
      Definition Classes
      Params
      +
    56. + + + + + + + + + def + + + getSignatures: Option[Map[String, String]] + + +

      +
    57. + + + + + + + + + def + + + getStorageRef: String + + +
      Definition Classes
      HasStorageRef
      +
    58. + + + + + + + + final + def + + + hasDefault[T](param: Param[T]): Boolean + + +
      Definition Classes
      Params
      +
    59. + + + + + + + + + def + + + hasParam(paramName: String): Boolean + + +
      Definition Classes
      Params
      +
    60. + + + + + + + + + def + + + hasParent: Boolean + + +
      Definition Classes
      Model
      +
    61. + + + + + + + + + def + + + hashCode(): Int + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    62. + + + + + + + + + def + + + initializeLogIfNecessary(isInterpreter: Boolean, silent: Boolean): Boolean + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    63. + + + + + + + + + def + + + initializeLogIfNecessary(isInterpreter: Boolean): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    64. + + + + + + + + + val + + + inputAnnotatorTypes: Array[String] + + +

      Annotator reference id.

      Annotator reference id. Used to identify elements in metadata or to refer to this annotator +type +

      Definition Classes
      InstructorEmbeddingsHasInputAnnotationCols
      +
    65. + + + + + + + + final + val + + + inputCols: StringArrayParam + + +

      columns that contain annotations necessary to run this annotator AnnotatorType is used both +as input and output columns if not specified +

      columns that contain annotations necessary to run this annotator AnnotatorType is used both +as input and output columns if not specified +

      Attributes
      protected
      Definition Classes
      HasInputAnnotationCols
      +
    66. + + + + + + + + + val + + + instruction: Param[String] + + +

      Set transformer instruction, e.g.

      Set transformer instruction, e.g. 'summarize' format: "instruction:". +

      +
    67. + + + + + + + + final + def + + + isDefined(param: Param[_]): Boolean + + +
      Definition Classes
      Params
      +
    68. + + + + + + + + final + def + + + isInstanceOf[T0]: Boolean + + +
      Definition Classes
      Any
      +
    69. + + + + + + + + final + def + + + isSet(param: Param[_]): Boolean + + +
      Definition Classes
      Params
      +
    70. + + + + + + + + + def + + + isTraceEnabled(): Boolean + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    71. + + + + + + + + + val + + + lazyAnnotator: BooleanParam + + +
      Definition Classes
      CanBeLazy
      +
    72. + + + + + + + + + def + + + log: Logger + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    73. + + + + + + + + + def + + + logDebug(msg: ⇒ String, throwable: Throwable): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    74. + + + + + + + + + def + + + logDebug(msg: ⇒ String): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    75. + + + + + + + + + def + + + logError(msg: ⇒ String, throwable: Throwable): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    76. + + + + + + + + + def + + + logError(msg: ⇒ String): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    77. + + + + + + + + + def + + + logInfo(msg: ⇒ String, throwable: Throwable): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    78. + + + + + + + + + def + + + logInfo(msg: ⇒ String): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    79. + + + + + + + + + def + + + logName: String + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    80. + + + + + + + + + def + + + logTrace(msg: ⇒ String, throwable: Throwable): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    81. + + + + + + + + + def + + + logTrace(msg: ⇒ String): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    82. + + + + + + + + + def + + + logWarning(msg: ⇒ String, throwable: Throwable): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    83. + + + + + + + + + def + + + logWarning(msg: ⇒ String): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    84. + + + + + + + + + val + + + maxSentenceLength: IntParam + + +

      Max sentence length to process (Default: 128) +

      +
    85. + + + + + + + + + def + + + msgHelper(schema: StructType): String + + +
      Attributes
      protected
      Definition Classes
      HasInputAnnotationCols
      +
    86. + + + + + + + + final + def + + + ne(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    87. + + + + + + + + final + def + + + notify(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    88. + + + + + + + + final + def + + + notifyAll(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    89. + + + + + + + + + def + + + onWrite(path: String, spark: SparkSession): Unit + + + +
    90. + + + + + + + + + val + + + optionalInputAnnotatorTypes: Array[String] + + +
      Definition Classes
      HasInputAnnotationCols
      +
    91. + + + + + + + + + val + + + outputAnnotatorType: AnnotatorType + + + +
    92. + + + + + + + + final + val + + + outputCol: Param[String] + + +
      Attributes
      protected
      Definition Classes
      HasOutputAnnotationCol
      +
    93. + + + + + + + + + lazy val + + + params: Array[Param[_]] + + +
      Definition Classes
      Params
      +
    94. + + + + + + + + + var + + + parent: Estimator[InstructorEmbeddings] + + +
      Definition Classes
      Model
      +
    95. + + + + + + + + + def + + + save(path: String): Unit + + +
      Definition Classes
      MLWritable
      Annotations
      + @Since( + + "1.6.0" + ) + + @throws( + + ... + ) + +
      +
    96. + + + + + + + + + def + + + set[T](param: ProtectedParam[T], value: T): InstructorEmbeddings.this.type + + +

      Sets the value for a protected Param.

      Sets the value for a protected Param.

      If the parameter was already set, it will not be set again. Default values do not count as a +set value and can be overridden. +

      T

      + Type of the parameter

      param

      + Protected parameter to set

      value

      + Value for the parameter

      returns

      + This object

      Definition Classes
      HasProtectedParams
      +
    97. + + + + + + + + + def + + + set[T](feature: StructFeature[T], value: T): InstructorEmbeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    98. + + + + + + + + + def + + + set[K, V](feature: MapFeature[K, V], value: Map[K, V]): InstructorEmbeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    99. + + + + + + + + + def + + + set[T](feature: SetFeature[T], value: Set[T]): InstructorEmbeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    100. + + + + + + + + + def + + + set[T](feature: ArrayFeature[T], value: Array[T]): InstructorEmbeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    101. + + + + + + + + final + def + + + set(paramPair: ParamPair[_]): InstructorEmbeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      Params
      +
    102. + + + + + + + + final + def + + + set(param: String, value: Any): InstructorEmbeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      Params
      +
    103. + + + + + + + + final + def + + + set[T](param: Param[T], value: T): InstructorEmbeddings.this.type + + +
      Definition Classes
      Params
      +
    104. + + + + + + + + + def + + + setBatchSize(size: Int): InstructorEmbeddings.this.type + + +

      Size of every batch.

      Size of every batch. +

      Definition Classes
      HasBatchedAnnotate
      +
    105. + + + + + + + + + def + + + setCaseSensitive(value: Boolean): InstructorEmbeddings.this.type + + +

      Whether to lowercase tokens or not +

      Whether to lowercase tokens or not +

      Definition Classes
      InstructorEmbeddingsHasCaseSensitiveProperties
      +
    106. + + + + + + + + + def + + + setConfigProtoBytes(bytes: Array[Int]): InstructorEmbeddings.this.type + + +

      +
    107. + + + + + + + + + def + + + setDefault[T](feature: StructFeature[T], value: () ⇒ T): InstructorEmbeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    108. + + + + + + + + + def + + + setDefault[K, V](feature: MapFeature[K, V], value: () ⇒ Map[K, V]): InstructorEmbeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    109. + + + + + + + + + def + + + setDefault[T](feature: SetFeature[T], value: () ⇒ Set[T]): InstructorEmbeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    110. + + + + + + + + + def + + + setDefault[T](feature: ArrayFeature[T], value: () ⇒ Array[T]): InstructorEmbeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      HasFeatures
      +
    111. + + + + + + + + final + def + + + setDefault(paramPairs: ParamPair[_]*): InstructorEmbeddings.this.type + + +
      Attributes
      protected
      Definition Classes
      Params
      +
    112. + + + + + + + + final + def + + + setDefault[T](param: Param[T], value: T): InstructorEmbeddings.this.type + + +
      Attributes
      protected[org.apache.spark.ml]
      Definition Classes
      Params
      +
    113. + + + + + + + + + def + + + setDimension(value: Int): InstructorEmbeddings.this.type + + +

      Set Embeddings dimensions for the BERT model Only possible to set this when the first time +is saved dimension is not changeable, it comes from BERT config file +

      Set Embeddings dimensions for the BERT model Only possible to set this when the first time +is saved dimension is not changeable, it comes from BERT config file +

      Definition Classes
      InstructorEmbeddingsHasEmbeddingsProperties
      +
    114. + + + + + + + + final + def + + + setInputCols(value: String*): InstructorEmbeddings.this.type + + +
      Definition Classes
      HasInputAnnotationCols
      +
    115. + + + + + + + + + def + + + setInputCols(value: Array[String]): InstructorEmbeddings.this.type + + +

      Overrides required annotators column if different than default

      Overrides required annotators column if different than default

      Definition Classes
      HasInputAnnotationCols
      +
    116. + + + + + + + + + def + + + setInstruction(value: String): InstructorEmbeddings.this.type + + + +
    117. + + + + + + + + + def + + + setLazyAnnotator(value: Boolean): InstructorEmbeddings.this.type + + +
      Definition Classes
      CanBeLazy
      +
    118. + + + + + + + + + def + + + setMaxSentenceLength(value: Int): InstructorEmbeddings.this.type + + +

      +
    119. + + + + + + + + + def + + + setModelIfNotSet(spark: SparkSession, tensorflowWrapper: TensorflowWrapper, spp: SentencePieceWrapper): InstructorEmbeddings + + +

      +
    120. + + + + + + + + final + def + + + setOutputCol(value: String): InstructorEmbeddings.this.type + + +

      Overrides annotation column name when transforming

      Overrides annotation column name when transforming

      Definition Classes
      HasOutputAnnotationCol
      +
    121. + + + + + + + + + def + + + setParent(parent: Estimator[InstructorEmbeddings]): InstructorEmbeddings + + +
      Definition Classes
      Model
      +
    122. + + + + + + + + + def + + + setSignatures(value: Map[String, String]): InstructorEmbeddings.this.type + + +

      +
    123. + + + + + + + + + def + + + setStorageRef(value: String): InstructorEmbeddings.this.type + + +
      Definition Classes
      HasStorageRef
      +
    124. + + + + + + + + + val + + + signatures: MapFeature[String, String] + + +

      It contains TF model signatures for the laded saved model +

      +
    125. + + + + + + + + + val + + + storageRef: Param[String] + + +

      Unique identifier for storage (Default: this.uid) +

      Unique identifier for storage (Default: this.uid) +

      Definition Classes
      HasStorageRef
      +
    126. + + + + + + + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + + +
      Definition Classes
      AnyRef
      +
    127. + + + + + + + + + def + + + toString(): String + + +
      Definition Classes
      Identifiable → AnyRef → Any
      +
    128. + + + + + + + + final + def + + + transform(dataset: Dataset[_]): DataFrame + + +

      Given requirements are met, this applies ML transformation within a Pipeline or stand-alone +Output annotation will be generated as a new column, previous annotations are still +available separately metadata is built at schema level to record annotations structural +information outside its content +

      Given requirements are met, this applies ML transformation within a Pipeline or stand-alone +Output annotation will be generated as a new column, previous annotations are still +available separately metadata is built at schema level to record annotations structural +information outside its content +

      dataset

      + Dataset[Row]

      Definition Classes
      AnnotatorModel → Transformer
      +
    129. + + + + + + + + + def + + + transform(dataset: Dataset[_], paramMap: ParamMap): DataFrame + + +
      Definition Classes
      Transformer
      Annotations
      + @Since( + + "2.0.0" + ) + +
      +
    130. + + + + + + + + + def + + + transform(dataset: Dataset[_], firstParamPair: ParamPair[_], otherParamPairs: ParamPair[_]*): DataFrame + + +
      Definition Classes
      Transformer
      Annotations
      + @Since( + + "2.0.0" + ) + + @varargs() + +
      +
    131. + + + + + + + + final + def + + + transformSchema(schema: StructType): StructType + + +

      requirement for pipeline transformation validation.

      requirement for pipeline transformation validation. It is called on fit()

      Definition Classes
      RawAnnotator → PipelineStage
      +
    132. + + + + + + + + + def + + + transformSchema(schema: StructType, logging: Boolean): StructType + + +
      Attributes
      protected
      Definition Classes
      PipelineStage
      Annotations
      + @DeveloperApi() + +
      +
    133. + + + + + + + + + val + + + uid: String + + +
      Definition Classes
      InstructorEmbeddings → Identifiable
      +
    134. + + + + + + + + + def + + + validate(schema: StructType): Boolean + + +

      takes a Dataset and checks to see if all the required annotation types are present.

      takes a Dataset and checks to see if all the required annotation types are present. +

      schema

      + to be validated

      returns

      + True if all the required types are present, else false

      Attributes
      protected
      Definition Classes
      RawAnnotator
      +
    135. + + + + + + + + + def + + + validateStorageRef(dataset: Dataset[_], inputCols: Array[String], annotatorType: String): Unit + + +
      Definition Classes
      HasStorageRef
      +
    136. + + + + + + + + final + def + + + wait(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    137. + + + + + + + + final + def + + + wait(arg0: Long, arg1: Int): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    138. + + + + + + + + final + def + + + wait(arg0: Long): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    139. + + + + + + + + + def + + + wrapColumnMetadata(col: Column): Column + + +
      Attributes
      protected
      Definition Classes
      RawAnnotator
      +
    140. + + + + + + + + + def + + + wrapEmbeddingsMetadata(col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column + + +
      Attributes
      protected
      Definition Classes
      HasEmbeddingsProperties
      +
    141. + + + + + + + + + def + + + wrapSentenceEmbeddingsMetadata(col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column + + +
      Attributes
      protected
      Definition Classes
      HasEmbeddingsProperties
      +
    142. + + + + + + + + + def + + + write: MLWriter + + +
      Definition Classes
      ParamsAndFeaturesWritable → DefaultParamsWritable → MLWritable
      +
    143. + + + + + + + + + def + + + writeSentencePieceModel(path: String, spark: SparkSession, spp: SentencePieceWrapper, suffix: String, filename: String): Unit + + +
      Definition Classes
      WriteSentencePieceModel
      +
    144. + + + + + + + + + def + + + writeTensorflowHub(path: String, tfPath: String, spark: SparkSession, suffix: String = "_use"): Unit + + +
      Definition Classes
      WriteTensorflowModel
      +
    145. + + + + + + + + + def + + + writeTensorflowModel(path: String, spark: SparkSession, tensorflow: TensorflowWrapper, suffix: String, filename: String, configProtoBytes: Option[Array[Byte]] = None): Unit + + +
      Definition Classes
      WriteTensorflowModel
      +
    146. + + + + + + + + + def + + + writeTensorflowModelV2(path: String, spark: SparkSession, tensorflow: TensorflowWrapper, suffix: String, filename: String, configProtoBytes: Option[Array[Byte]] = None, savedSignatures: Option[Map[String, String]] = None): Unit + + +
      Definition Classes
      WriteTensorflowModel
      +
    147. +
    +
    + + + + +
    + +
    +
    +

    Inherited from HasEngine

    +
    +

    Inherited from HasCaseSensitiveProperties

    +
    +

    Inherited from WriteSentencePieceModel

    +
    +

    Inherited from HasStorageRef

    +
    +

    Inherited from HasEmbeddingsProperties

    +
    +

    Inherited from HasProtectedParams

    +
    +

    Inherited from WriteTensorflowModel

    +
    +

    Inherited from AnnotatorModel[InstructorEmbeddings]

    +
    +

    Inherited from CanBeLazy

    +
    +

    Inherited from RawAnnotator[InstructorEmbeddings]

    +
    +

    Inherited from HasOutputAnnotationCol

    +
    +

    Inherited from HasInputAnnotationCols

    +
    +

    Inherited from HasOutputAnnotatorType

    +
    +

    Inherited from ParamsAndFeaturesWritable

    +
    +

    Inherited from HasFeatures

    +
    +

    Inherited from DefaultParamsWritable

    +
    +

    Inherited from MLWritable

    +
    +

    Inherited from Model[InstructorEmbeddings]

    +
    +

    Inherited from Transformer

    +
    +

    Inherited from PipelineStage

    +
    +

    Inherited from Logging

    +
    +

    Inherited from Params

    +
    +

    Inherited from Serializable

    +
    +

    Inherited from Serializable

    +
    +

    Inherited from Identifiable

    +
    +

    Inherited from AnyRef

    +
    +

    Inherited from Any

    +
    + +
    + +
    +
    +

    Parameters

    +

    + A list of (hyper-)parameter keys this annotator can take. Users can set and get the + parameter values through setters and getters, respectively.

    +
    +

    Members

    + +
    +

    Parameter setters

    + +
    +

    Parameter getters

    + +
    +
    + +
    + +
    + + + +
    +
    +
    + + diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings$.html index d0b5c03fe26d21..9712902541048c 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.LongformerEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.LongformerEmbeddings + + @@ -28,7 +28,7 @@

  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings.html b/docs/api/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings.html index a93418fa86e6ce..2077afdc147003 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.LongformerEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.LongformerEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • @@ -2638,9 +2662,9 @@

    Value Members

  • - + - + @@ -2649,7 +2673,7 @@

    Value Members

    def - setModelIfNotSet(spark: SparkSession, tensorflowWrapper: TensorflowWrapper): LongformerEmbeddings + setModelIfNotSet(spark: SparkSession, tensorflowWrapper: Option[TensorflowWrapper], onnxWrapper: Option[OnnxWrapper]): LongformerEmbeddings

    diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/PoolingStrategy$$AnnotatorType$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/PoolingStrategy$$AnnotatorType$.html index 954653c6859c1c..8a8e83c51b580d 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/PoolingStrategy$$AnnotatorType$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/PoolingStrategy$$AnnotatorType$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.PoolingStrategy.AnnotatorType - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.PoolingStrategy.AnnotatorType + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadAlbertDLModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadAlbertDLModel.html index e7236b0d3a3d73..98f0422ed7ca48 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadAlbertDLModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadAlbertDLModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadAlbertDLModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadAlbertDLModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadBertDLModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadBertDLModel.html index bc574f41a882e4..234d6cd75bfa68 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadBertDLModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadBertDLModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadBertDLModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadBertDLModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • @@ -400,7 +424,7 @@

    trait - ReadBertDLModel extends ReadTensorflowModel + ReadBertDLModel extends ReadTensorflowModel with ReadOnnxModel

    @@ -409,7 +433,7 @@

    Linear Supertypes -
    ReadTensorflowModel, AnyRef, Any
    +
    Known Subclasses @@ -440,7 +464,7 @@

    Inherited
      -
    1. ReadBertDLModel
    2. ReadTensorflowModel
    3. AnyRef
    4. Any
    5. +
    6. ReadBertDLModel
    7. ReadOnnxModel
    8. ReadTensorflowModel
    9. AnyRef
    10. Any

    @@ -733,6 +757,22 @@

    Value Members

    @native()
    +

  • + + + + + + + + + val + + + onnxFile: String + + +
    Definition Classes
    ReadBertDLModelReadOnnxModel
  • @@ -749,6 +789,22 @@

    Value Members

    +
  • + + + + + + + + + def + + + readOnnxModel(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, sessionOptions: Option[SessionOptions] = None): OnnxWrapper + + +
    Definition Classes
    ReadOnnxModel
  • @@ -939,7 +995,9 @@

    Value Members

    -
    +
    +

    Inherited from ReadOnnxModel

    +

    Inherited from ReadTensorflowModel

    Inherited from AnyRef

    diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadBertSentenceDLModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadBertSentenceDLModel.html index 9d05c1c2ab2b77..e95997cee9e0f1 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadBertSentenceDLModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadBertSentenceDLModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadBertSentenceDLModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadBertSentenceDLModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • @@ -400,7 +424,7 @@

    trait - ReadBertSentenceDLModel extends ReadTensorflowModel + ReadBertSentenceDLModel extends ReadTensorflowModel with ReadOnnxModel

    @@ -409,7 +433,7 @@

    Linear Supertypes -
    ReadTensorflowModel, AnyRef, Any
    +
    Known Subclasses @@ -440,7 +464,7 @@

    Inherited
      -
    1. ReadBertSentenceDLModel
    2. ReadTensorflowModel
    3. AnyRef
    4. Any
    5. +
    6. ReadBertSentenceDLModel
    7. ReadOnnxModel
    8. ReadTensorflowModel
    9. AnyRef
    10. Any

    @@ -733,6 +757,22 @@

    Value Members

    @native()
    +

  • + + + + + + + + + val + + + onnxFile: String + + +
    Definition Classes
    ReadBertSentenceDLModelReadOnnxModel
  • @@ -749,6 +789,22 @@

    Value Members

    +
  • + + + + + + + + + def + + + readOnnxModel(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, sessionOptions: Option[SessionOptions] = None): OnnxWrapper + + +
    Definition Classes
    ReadOnnxModel
  • @@ -939,7 +995,9 @@

    Value Members

    -
    +
    +

    Inherited from ReadOnnxModel

    +

    Inherited from ReadTensorflowModel

    Inherited from AnyRef

    diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadCamemBertDLModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadCamemBertDLModel.html index 29b3b092f82837..c8b05dd0e26ec0 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadCamemBertDLModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadCamemBertDLModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadCamemBertDLModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadCamemBertDLModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadDeBertaDLModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadDeBertaDLModel.html index 7f69842f2d8efb..b31417a56f58d2 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadDeBertaDLModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadDeBertaDLModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadDeBertaDLModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadDeBertaDLModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • @@ -400,7 +424,7 @@

    trait - ReadDeBertaDLModel extends ReadTensorflowModel with ReadSentencePieceModel + ReadDeBertaDLModel extends ReadTensorflowModel with ReadSentencePieceModel with ReadOnnxModel

    @@ -409,7 +433,7 @@

    Linear Supertypes - +
    Known Subclasses @@ -440,7 +464,7 @@

    Inherited
      -
    1. ReadDeBertaDLModel
    2. ReadSentencePieceModel
    3. ReadTensorflowModel
    4. AnyRef
    5. Any
    6. +
    7. ReadDeBertaDLModel
    8. ReadOnnxModel
    9. ReadSentencePieceModel
    10. ReadTensorflowModel
    11. AnyRef
    12. Any

    @@ -733,6 +757,22 @@

    Value Members

    @native()
    +

  • + + + + + + + + + val + + + onnxFile: String + + +
    Definition Classes
    ReadDeBertaDLModelReadOnnxModel
  • @@ -749,6 +789,22 @@

    Value Members

    +
  • + + + + + + + + + def + + + readOnnxModel(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, sessionOptions: Option[SessionOptions] = None): OnnxWrapper + + +
    Definition Classes
    ReadOnnxModel
  • @@ -971,7 +1027,9 @@

    Value Members

    -
    +
    +

    Inherited from ReadOnnxModel

    +

    Inherited from ReadSentencePieceModel

    Inherited from ReadTensorflowModel

    diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadDistilBertDLModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadDistilBertDLModel.html index 43c03982c2a5c0..21cfa5bb69a012 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadDistilBertDLModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadDistilBertDLModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadDistilBertDLModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadDistilBertDLModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • @@ -400,7 +424,7 @@

    trait - ReadDistilBertDLModel extends ReadTensorflowModel + ReadDistilBertDLModel extends ReadTensorflowModel with ReadOnnxModel

    @@ -409,7 +433,7 @@

    Linear Supertypes -
    ReadTensorflowModel, AnyRef, Any
    +
    Known Subclasses @@ -440,7 +464,7 @@

    Inherited
      -
    1. ReadDistilBertDLModel
    2. ReadTensorflowModel
    3. AnyRef
    4. Any
    5. +
    6. ReadDistilBertDLModel
    7. ReadOnnxModel
    8. ReadTensorflowModel
    9. AnyRef
    10. Any

    @@ -733,6 +757,22 @@

    Value Members

    @native()
    +

  • + + + + + + + + + val + + + onnxFile: String + + +
    Definition Classes
    ReadDistilBertDLModelReadOnnxModel
  • @@ -749,6 +789,22 @@

    Value Members

    +
  • + + + + + + + + + def + + + readOnnxModel(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, sessionOptions: Option[SessionOptions] = None): OnnxWrapper + + +
    Definition Classes
    ReadOnnxModel
  • @@ -939,7 +995,9 @@

    Value Members

    -
    +
    +

    Inherited from ReadOnnxModel

    +

    Inherited from ReadTensorflowModel

    Inherited from AnyRef

    diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadE5DLModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadE5DLModel.html new file mode 100644 index 00000000000000..36454c5d8f98b7 --- /dev/null +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadE5DLModel.html @@ -0,0 +1,993 @@ + + + + + + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadE5DLModel + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    Packages

    + +
    +
    +
    + +
    +
    t
    +

    com.johnsnowlabs.nlp.embeddings

    +

    ReadE5DLModel + + + +

    +

    +
    + +

    + + + trait + + + ReadE5DLModel extends ReadTensorflowModel + +

    + + +
    Self Type
    ReadE5DLModel with ParamsAndFeaturesReadable[E5Embeddings]
    + + Linear Supertypes + +
    ReadTensorflowModel, AnyRef, Any
    +
    + + Known Subclasses + + +
    + + +
    +
    +
    + + + + + +
    +
    +
    + Ordering +
      + +
    1. Alphabetic
    2. +
    3. By Inheritance
    4. +
    +
    +
    + Inherited
    +
    +
      +
    1. ReadE5DLModel
    2. ReadTensorflowModel
    3. AnyRef
    4. Any
    5. +
    +
    + +
      +
    1. Hide All
    2. +
    3. Show All
    4. +
    +
    +
    + Visibility +
    1. Public
    2. All
    +
    +
    +
    + +
    +
    + + + + + + +
    +

    Value Members

    +
      +
    1. + + + + + + + + final + def + + + !=(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    2. + + + + + + + + final + def + + + ##(): Int + + +
      Definition Classes
      AnyRef → Any
      +
    3. + + + + + + + + final + def + + + ==(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    4. + + + + + + + + final + def + + + asInstanceOf[T0]: T0 + + +
      Definition Classes
      Any
      +
    5. + + + + + + + + + def + + + clone(): AnyRef + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    6. + + + + + + + + final + def + + + eq(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    7. + + + + + + + + + def + + + equals(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    8. + + + + + + + + + def + + + finalize(): Unit + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + classOf[java.lang.Throwable] + ) + +
      +
    9. + + + + + + + + final + def + + + getClass(): Class[_] + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    10. + + + + + + + + + def + + + hashCode(): Int + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    11. + + + + + + + + final + def + + + isInstanceOf[T0]: Boolean + + +
      Definition Classes
      Any
      +
    12. + + + + + + + + + def + + + loadSavedModel(modelPath: String, spark: SparkSession): E5Embeddings + + + +
    13. + + + + + + + + final + def + + + ne(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    14. + + + + + + + + final + def + + + notify(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    15. + + + + + + + + final + def + + + notifyAll(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    16. + + + + + + + + + def + + + readModel(instance: E5Embeddings, path: String, spark: SparkSession): Unit + + + +
    17. + + + + + + + + + def + + + readTensorflowChkPoints(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, tags: Array[String] = Array.empty, initAllTables: Boolean = false): TensorflowWrapper + + +
      Definition Classes
      ReadTensorflowModel
      +
    18. + + + + + + + + + def + + + readTensorflowHub(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, tags: Array[String] = Array.empty): TensorflowWrapper + + +
      Definition Classes
      ReadTensorflowModel
      +
    19. + + + + + + + + + def + + + readTensorflowModel(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, tags: Array[String] = Array.empty, initAllTables: Boolean = false, savedSignatures: Option[Map[String, String]] = None): TensorflowWrapper + + +
      Definition Classes
      ReadTensorflowModel
      +
    20. + + + + + + + + + def + + + readTensorflowWithSPModel(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, tags: Array[String] = Array.empty, initAllTables: Boolean = false, loadSP: Boolean = false): TensorflowWrapper + + +
      Definition Classes
      ReadTensorflowModel
      +
    21. + + + + + + + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + + +
      Definition Classes
      AnyRef
      +
    22. + + + + + + + + + val + + + tfFile: String + + +
      Definition Classes
      ReadE5DLModelReadTensorflowModel
      +
    23. + + + + + + + + + def + + + toString(): String + + +
      Definition Classes
      AnyRef → Any
      +
    24. + + + + + + + + final + def + + + wait(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    25. + + + + + + + + final + def + + + wait(arg0: Long, arg1: Int): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    26. + + + + + + + + final + def + + + wait(arg0: Long): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    27. +
    +
    + + + + +
    + +
    +
    +

    Inherited from ReadTensorflowModel

    +
    +

    Inherited from AnyRef

    +
    +

    Inherited from Any

    +
    + +
    + +
    +
    +

    Ungrouped

    + +
    +
    + +
    + +
    + + + +
    +
    +
    + + diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadElmoDLModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadElmoDLModel.html index 7a025e84eae004..53c10e8c47f262 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadElmoDLModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadElmoDLModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadElmoDLModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadElmoDLModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadInstructorDLModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadInstructorDLModel.html new file mode 100644 index 00000000000000..e951dc505b7c6f --- /dev/null +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadInstructorDLModel.html @@ -0,0 +1,1027 @@ + + + + + + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadInstructorDLModel + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    Packages

    + +
    +
    +
    + +
    +
    t
    +

    com.johnsnowlabs.nlp.embeddings

    +

    ReadInstructorDLModel + + + +

    +

    +
    + +

    + + + trait + + + ReadInstructorDLModel extends ReadTensorflowModel with ReadSentencePieceModel + +

    + + +
    Self Type
    ReadInstructorDLModel with ParamsAndFeaturesReadable[InstructorEmbeddings]
    + + Linear Supertypes + + +
    + + Known Subclasses + + +
    + + +
    +
    +
    + + + + + +
    +
    +
    + Ordering +
      + +
    1. Alphabetic
    2. +
    3. By Inheritance
    4. +
    +
    +
    + Inherited
    +
    +
      +
    1. ReadInstructorDLModel
    2. ReadSentencePieceModel
    3. ReadTensorflowModel
    4. AnyRef
    5. Any
    6. +
    +
    + +
      +
    1. Hide All
    2. +
    3. Show All
    4. +
    +
    +
    + Visibility +
    1. Public
    2. All
    +
    +
    +
    + +
    +
    + + + + + + +
    +

    Value Members

    +
      +
    1. + + + + + + + + final + def + + + !=(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    2. + + + + + + + + final + def + + + ##(): Int + + +
      Definition Classes
      AnyRef → Any
      +
    3. + + + + + + + + final + def + + + ==(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    4. + + + + + + + + final + def + + + asInstanceOf[T0]: T0 + + +
      Definition Classes
      Any
      +
    5. + + + + + + + + + def + + + clone(): AnyRef + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    6. + + + + + + + + final + def + + + eq(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    7. + + + + + + + + + def + + + equals(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    8. + + + + + + + + + def + + + finalize(): Unit + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + classOf[java.lang.Throwable] + ) + +
      +
    9. + + + + + + + + final + def + + + getClass(): Class[_] + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    10. + + + + + + + + + def + + + hashCode(): Int + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    11. + + + + + + + + final + def + + + isInstanceOf[T0]: Boolean + + +
      Definition Classes
      Any
      +
    12. + + + + + + + + + def + + + loadSavedModel(modelPath: String, spark: SparkSession): InstructorEmbeddings + + + +
    13. + + + + + + + + final + def + + + ne(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    14. + + + + + + + + final + def + + + notify(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    15. + + + + + + + + final + def + + + notifyAll(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    16. + + + + + + + + + def + + + readModel(instance: InstructorEmbeddings, path: String, spark: SparkSession): Unit + + + +
    17. + + + + + + + + + def + + + readSentencePieceModel(path: String, spark: SparkSession, suffix: String, filename: String): SentencePieceWrapper + + +
      Definition Classes
      ReadSentencePieceModel
      +
    18. + + + + + + + + + def + + + readTensorflowChkPoints(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, tags: Array[String] = Array.empty, initAllTables: Boolean = false): TensorflowWrapper + + +
      Definition Classes
      ReadTensorflowModel
      +
    19. + + + + + + + + + def + + + readTensorflowHub(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, tags: Array[String] = Array.empty): TensorflowWrapper + + +
      Definition Classes
      ReadTensorflowModel
      +
    20. + + + + + + + + + def + + + readTensorflowModel(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, tags: Array[String] = Array.empty, initAllTables: Boolean = false, savedSignatures: Option[Map[String, String]] = None): TensorflowWrapper + + +
      Definition Classes
      ReadTensorflowModel
      +
    21. + + + + + + + + + def + + + readTensorflowWithSPModel(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, tags: Array[String] = Array.empty, initAllTables: Boolean = false, loadSP: Boolean = false): TensorflowWrapper + + +
      Definition Classes
      ReadTensorflowModel
      +
    22. + + + + + + + + + val + + + sppFile: String + + + +
    23. + + + + + + + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + + +
      Definition Classes
      AnyRef
      +
    24. + + + + + + + + + val + + + tfFile: String + + + +
    25. + + + + + + + + + def + + + toString(): String + + +
      Definition Classes
      AnyRef → Any
      +
    26. + + + + + + + + final + def + + + wait(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    27. + + + + + + + + final + def + + + wait(arg0: Long, arg1: Int): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    28. + + + + + + + + final + def + + + wait(arg0: Long): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    29. +
    +
    + + + + +
    + +
    +
    +

    Inherited from ReadSentencePieceModel

    +
    +

    Inherited from ReadTensorflowModel

    +
    +

    Inherited from AnyRef

    +
    +

    Inherited from Any

    +
    + +
    + +
    +
    +

    Ungrouped

    + +
    +
    + +
    + +
    + + + +
    +
    +
    + + diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadLongformerDLModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadLongformerDLModel.html index b045bc66cd9592..9b8ead79df452a 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadLongformerDLModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadLongformerDLModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadLongformerDLModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadLongformerDLModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadRobertaDLModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadRobertaDLModel.html index 81711c8ae35405..5b81916ad7c6ed 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadRobertaDLModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadRobertaDLModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadRobertaDLModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadRobertaDLModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • @@ -400,7 +424,7 @@

    trait - ReadRobertaDLModel extends ReadTensorflowModel + ReadRobertaDLModel extends ReadTensorflowModel with ReadOnnxModel

    @@ -409,7 +433,7 @@

    Linear Supertypes -
    ReadTensorflowModel, AnyRef, Any
    +
    Known Subclasses @@ -440,7 +464,7 @@

    Inherited
      -
    1. ReadRobertaDLModel
    2. ReadTensorflowModel
    3. AnyRef
    4. Any
    5. +
    6. ReadRobertaDLModel
    7. ReadOnnxModel
    8. ReadTensorflowModel
    9. AnyRef
    10. Any

    @@ -733,6 +757,22 @@

    Value Members

    @native()
    +

  • + + + + + + + + + val + + + onnxFile: String + + +
    Definition Classes
    ReadRobertaDLModelReadOnnxModel
  • @@ -749,6 +789,22 @@

    Value Members

    +
  • + + + + + + + + + def + + + readOnnxModel(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, sessionOptions: Option[SessionOptions] = None): OnnxWrapper + + +
    Definition Classes
    ReadOnnxModel
  • @@ -939,7 +995,9 @@

    Value Members

    -
    +
    +

    Inherited from ReadOnnxModel

    +

    Inherited from ReadTensorflowModel

    Inherited from AnyRef

    diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadRobertaSentenceDLModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadRobertaSentenceDLModel.html index 2fe4acddcbae62..717471682e2500 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadRobertaSentenceDLModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadRobertaSentenceDLModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadRobertaSentenceDLModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadRobertaSentenceDLModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadUSEDLModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadUSEDLModel.html index acb472746b7802..60ccc5912a0a44 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadUSEDLModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadUSEDLModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadUSEDLModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadUSEDLModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadXlmRobertaDLModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadXlmRobertaDLModel.html index 001f44c419cd60..d15bf90bdccf72 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadXlmRobertaDLModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadXlmRobertaDLModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadXlmRobertaDLModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadXlmRobertaDLModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadXlmRobertaSentenceDLModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadXlmRobertaSentenceDLModel.html index b1fb9e4004d0c5..42c1f5386ba1c6 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadXlmRobertaSentenceDLModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadXlmRobertaSentenceDLModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadXlmRobertaSentenceDLModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadXlmRobertaSentenceDLModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadXlnetDLModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadXlnetDLModel.html index 5988ab9e9eec9a..cf26bd27620c15 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadXlnetDLModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadXlnetDLModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadXlnetDLModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadXlnetDLModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedAlbertModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedAlbertModel.html index aec7c348b335bb..5b54da3eb49aff 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedAlbertModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedAlbertModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedAlbertModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedAlbertModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedBertModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedBertModel.html index 01f781f40a15bc..c4bb43955d06d2 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedBertModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedBertModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedBertModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedBertModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedBertSentenceModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedBertSentenceModel.html index 553201f9571905..e7947dd78057e4 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedBertSentenceModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedBertSentenceModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedBertSentenceModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedBertSentenceModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedCamemBertModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedCamemBertModel.html index 1c0343c5a542de..844b789176b802 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedCamemBertModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedCamemBertModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedCamemBertModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedCamemBertModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedDeBertaModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedDeBertaModel.html index 119e572c602697..69c8ee14776b63 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedDeBertaModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedDeBertaModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedDeBertaModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedDeBertaModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedDistilBertModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedDistilBertModel.html index 9b06a12a10b2ab..9680ee78432d52 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedDistilBertModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedDistilBertModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedDistilBertModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedDistilBertModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedDoc2Vec.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedDoc2Vec.html index c450ec65bdf183..feacab8c4b0578 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedDoc2Vec.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedDoc2Vec.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedDoc2Vec - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedDoc2Vec + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedE5Model.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedE5Model.html new file mode 100644 index 00000000000000..3911a04340b328 --- /dev/null +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedE5Model.html @@ -0,0 +1,1053 @@ + + + + + + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedE5Model + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    Packages

    + +
    +
    +
    + +
    +
    t
    +

    com.johnsnowlabs.nlp.embeddings

    +

    ReadablePretrainedE5Model + + + +

    +

    +
    + +

    + + + trait + + + ReadablePretrainedE5Model extends ParamsAndFeaturesReadable[E5Embeddings] with HasPretrained[E5Embeddings] + +

    + + +
    + + Linear Supertypes + +
    HasPretrained[E5Embeddings], ParamsAndFeaturesReadable[E5Embeddings], DefaultParamsReadable[E5Embeddings], MLReadable[E5Embeddings], AnyRef, Any
    +
    + + Known Subclasses + + +
    + + +
    +
    +
    + + + + + +
    +
    +
    + Ordering +
      + +
    1. Alphabetic
    2. +
    3. By Inheritance
    4. +
    +
    +
    + Inherited
    +
    +
      +
    1. ReadablePretrainedE5Model
    2. HasPretrained
    3. ParamsAndFeaturesReadable
    4. DefaultParamsReadable
    5. MLReadable
    6. AnyRef
    7. Any
    8. +
    +
    + +
      +
    1. Hide All
    2. +
    3. Show All
    4. +
    +
    +
    + Visibility +
    1. Public
    2. All
    +
    +
    +
    + +
    +
    + + + + + + +
    +

    Value Members

    +
      +
    1. + + + + + + + + final + def + + + !=(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    2. + + + + + + + + final + def + + + ##(): Int + + +
      Definition Classes
      AnyRef → Any
      +
    3. + + + + + + + + final + def + + + ==(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    4. + + + + + + + + + def + + + addReader(reader: (E5Embeddings, String, SparkSession) ⇒ Unit): Unit + + +
      Definition Classes
      ParamsAndFeaturesReadable
      +
    5. + + + + + + + + final + def + + + asInstanceOf[T0]: T0 + + +
      Definition Classes
      Any
      +
    6. + + + + + + + + + def + + + clone(): AnyRef + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    7. + + + + + + + + + val + + + defaultLang: String + + +
      Definition Classes
      HasPretrained
      +
    8. + + + + + + + + + lazy val + + + defaultLoc: String + + +
      Definition Classes
      HasPretrained
      +
    9. + + + + + + + + + val + + + defaultModelName: Some[String] + + + +
    10. + + + + + + + + final + def + + + eq(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    11. + + + + + + + + + def + + + equals(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    12. + + + + + + + + + def + + + finalize(): Unit + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + classOf[java.lang.Throwable] + ) + +
      +
    13. + + + + + + + + final + def + + + getClass(): Class[_] + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    14. + + + + + + + + + def + + + hashCode(): Int + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    15. + + + + + + + + final + def + + + isInstanceOf[T0]: Boolean + + +
      Definition Classes
      Any
      +
    16. + + + + + + + + + def + + + load(path: String): E5Embeddings + + +
      Definition Classes
      MLReadable
      Annotations
      + @Since( + + "1.6.0" + ) + +
      +
    17. + + + + + + + + final + def + + + ne(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    18. + + + + + + + + final + def + + + notify(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    19. + + + + + + + + final + def + + + notifyAll(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    20. + + + + + + + + + def + + + pretrained(name: String, lang: String, remoteLoc: String): E5Embeddings + + +

      Java default argument interoperability

      Java default argument interoperability

      Definition Classes
      ReadablePretrainedE5ModelHasPretrained
      +
    21. + + + + + + + + + def + + + pretrained(name: String, lang: String): E5Embeddings + + + +
    22. + + + + + + + + + def + + + pretrained(name: String): E5Embeddings + + + +
    23. + + + + + + + + + def + + + pretrained(): E5Embeddings + + +

      Java compliant-overrides

      Java compliant-overrides

      Definition Classes
      ReadablePretrainedE5ModelHasPretrained
      +
    24. + + + + + + + + + def + + + read: MLReader[E5Embeddings] + + +
      Definition Classes
      ParamsAndFeaturesReadable → DefaultParamsReadable → MLReadable
      +
    25. + + + + + + + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + + +
      Definition Classes
      AnyRef
      +
    26. + + + + + + + + + def + + + toString(): String + + +
      Definition Classes
      AnyRef → Any
      +
    27. + + + + + + + + final + def + + + wait(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    28. + + + + + + + + final + def + + + wait(arg0: Long, arg1: Int): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    29. + + + + + + + + final + def + + + wait(arg0: Long): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    30. +
    +
    + + + + +
    + +
    +
    +

    Inherited from HasPretrained[E5Embeddings]

    +
    +

    Inherited from DefaultParamsReadable[E5Embeddings]

    +
    +

    Inherited from MLReadable[E5Embeddings]

    +
    +

    Inherited from AnyRef

    +
    +

    Inherited from Any

    +
    + +
    + +
    +
    +

    Ungrouped

    + +
    +
    + +
    + +
    + + + +
    +
    +
    + + diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedElmoModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedElmoModel.html index 876332ff1add79..0d13eeac14bf76 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedElmoModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedElmoModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedElmoModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedElmoModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedInstructorModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedInstructorModel.html new file mode 100644 index 00000000000000..2d290dfbbaf248 --- /dev/null +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedInstructorModel.html @@ -0,0 +1,1053 @@ + + + + + + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedInstructorModel + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    Packages

    + +
    +
    +
    + +
    +
    t
    +

    com.johnsnowlabs.nlp.embeddings

    +

    ReadablePretrainedInstructorModel + + + +

    +

    +
    + +

    + + + trait + + + ReadablePretrainedInstructorModel extends ParamsAndFeaturesReadable[InstructorEmbeddings] with HasPretrained[InstructorEmbeddings] + +

    + + +
    + + Linear Supertypes + + +
    + + Known Subclasses + + +
    + + +
    +
    +
    + + + + + +
    +
    +
    + Ordering +
      + +
    1. Alphabetic
    2. +
    3. By Inheritance
    4. +
    +
    +
    + Inherited
    +
    +
      +
    1. ReadablePretrainedInstructorModel
    2. HasPretrained
    3. ParamsAndFeaturesReadable
    4. DefaultParamsReadable
    5. MLReadable
    6. AnyRef
    7. Any
    8. +
    +
    + +
      +
    1. Hide All
    2. +
    3. Show All
    4. +
    +
    +
    + Visibility +
    1. Public
    2. All
    +
    +
    +
    + +
    +
    + + + + + + +
    +

    Value Members

    +
      +
    1. + + + + + + + + final + def + + + !=(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    2. + + + + + + + + final + def + + + ##(): Int + + +
      Definition Classes
      AnyRef → Any
      +
    3. + + + + + + + + final + def + + + ==(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    4. + + + + + + + + + def + + + addReader(reader: (InstructorEmbeddings, String, SparkSession) ⇒ Unit): Unit + + +
      Definition Classes
      ParamsAndFeaturesReadable
      +
    5. + + + + + + + + final + def + + + asInstanceOf[T0]: T0 + + +
      Definition Classes
      Any
      +
    6. + + + + + + + + + def + + + clone(): AnyRef + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    7. + + + + + + + + + val + + + defaultLang: String + + +
      Definition Classes
      HasPretrained
      +
    8. + + + + + + + + + lazy val + + + defaultLoc: String + + +
      Definition Classes
      HasPretrained
      +
    9. + + + + + + + + + val + + + defaultModelName: Some[String] + + + +
    10. + + + + + + + + final + def + + + eq(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    11. + + + + + + + + + def + + + equals(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    12. + + + + + + + + + def + + + finalize(): Unit + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + classOf[java.lang.Throwable] + ) + +
      +
    13. + + + + + + + + final + def + + + getClass(): Class[_] + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    14. + + + + + + + + + def + + + hashCode(): Int + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    15. + + + + + + + + final + def + + + isInstanceOf[T0]: Boolean + + +
      Definition Classes
      Any
      +
    16. + + + + + + + + + def + + + load(path: String): InstructorEmbeddings + + +
      Definition Classes
      MLReadable
      Annotations
      + @Since( + + "1.6.0" + ) + +
      +
    17. + + + + + + + + final + def + + + ne(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    18. + + + + + + + + final + def + + + notify(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    19. + + + + + + + + final + def + + + notifyAll(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    20. + + + + + + + + + def + + + pretrained(name: String, lang: String, remoteLoc: String): InstructorEmbeddings + + +

      Java default argument interoperability

      Java default argument interoperability

      Definition Classes
      ReadablePretrainedInstructorModelHasPretrained
      +
    21. + + + + + + + + + def + + + pretrained(name: String, lang: String): InstructorEmbeddings + + + +
    22. + + + + + + + + + def + + + pretrained(name: String): InstructorEmbeddings + + + +
    23. + + + + + + + + + def + + + pretrained(): InstructorEmbeddings + + +

      Java compliant-overrides

      Java compliant-overrides

      Definition Classes
      ReadablePretrainedInstructorModelHasPretrained
      +
    24. + + + + + + + + + def + + + read: MLReader[InstructorEmbeddings] + + +
      Definition Classes
      ParamsAndFeaturesReadable → DefaultParamsReadable → MLReadable
      +
    25. + + + + + + + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + + +
      Definition Classes
      AnyRef
      +
    26. + + + + + + + + + def + + + toString(): String + + +
      Definition Classes
      AnyRef → Any
      +
    27. + + + + + + + + final + def + + + wait(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    28. + + + + + + + + final + def + + + wait(arg0: Long, arg1: Int): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    29. + + + + + + + + final + def + + + wait(arg0: Long): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    30. +
    +
    + + + + +
    + +
    +
    +

    Inherited from HasPretrained[InstructorEmbeddings]

    +
    +

    Inherited from DefaultParamsReadable[InstructorEmbeddings]

    +
    +

    Inherited from MLReadable[InstructorEmbeddings]

    +
    +

    Inherited from AnyRef

    +
    +

    Inherited from Any

    +
    + +
    + +
    +
    +

    Ungrouped

    + +
    +
    + +
    + +
    + + + +
    +
    +
    + + diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedLongformerModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedLongformerModel.html index a8c76b3e57d5ff..55db776a9802dd 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedLongformerModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedLongformerModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedLongformerModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedLongformerModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedRobertaModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedRobertaModel.html index ef348f45957ed7..37b3403a12e7ae 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedRobertaModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedRobertaModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedRobertaModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedRobertaModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedRobertaSentenceModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedRobertaSentenceModel.html index e3769b0fca9cd5..df56ebd384846d 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedRobertaSentenceModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedRobertaSentenceModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedRobertaSentenceModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedRobertaSentenceModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedUSEModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedUSEModel.html index 212e10c887199c..56375ce8bd3fd2 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedUSEModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedUSEModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedUSEModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedUSEModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedWord2Vec.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedWord2Vec.html index 92c2188711b4ed..6ce35ee1814007 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedWord2Vec.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedWord2Vec.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedWord2Vec - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedWord2Vec + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedWordEmbeddings.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedWordEmbeddings.html index ed73adeb9cf690..28be250f762df3 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedWordEmbeddings.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedWordEmbeddings.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedWordEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedWordEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedXlmRobertaModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedXlmRobertaModel.html index 89da57748fece4..b904fa6ff29acd 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedXlmRobertaModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedXlmRobertaModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedXlmRobertaModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedXlmRobertaModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedXlmRobertaSentenceModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedXlmRobertaSentenceModel.html index 75f27d27a5da50..a6d308a81562e7 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedXlmRobertaSentenceModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedXlmRobertaSentenceModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedXlmRobertaSentenceModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedXlmRobertaSentenceModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedXlnetModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedXlnetModel.html index 9beb3242cb5228..06f9510319188d 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedXlnetModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedXlnetModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedXlnetModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadablePretrainedXlnetModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadsFromBytes.html b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadsFromBytes.html index e968b589b70fe8..b20329a419e87f 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/ReadsFromBytes.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/ReadsFromBytes.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadsFromBytes - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.ReadsFromBytes + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings$.html index 72a625414e3134..6f85c60311a55e 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.RoBertaEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.RoBertaEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • @@ -413,7 +437,7 @@

    Linear Supertypes - + @@ -439,7 +463,7 @@

    Inherited
      -
    1. RoBertaEmbeddings
    2. Serializable
    3. Serializable
    4. ReadRobertaDLModel
    5. ReadTensorflowModel
    6. ReadablePretrainedRobertaModel
    7. HasPretrained
    8. ParamsAndFeaturesReadable
    9. DefaultParamsReadable
    10. MLReadable
    11. AnyRef
    12. Any
    13. +
    14. RoBertaEmbeddings
    15. Serializable
    16. Serializable
    17. ReadRobertaDLModel
    18. ReadOnnxModel
    19. ReadTensorflowModel
    20. ReadablePretrainedRobertaModel
    21. HasPretrained
    22. ParamsAndFeaturesReadable
    23. DefaultParamsReadable
    24. MLReadable
    25. AnyRef
    26. Any
    @@ -818,6 +842,22 @@

    Value Members

    @native()
    +

  • + + + + + + + + + val + + + onnxFile: String + + +
    Definition Classes
    ReadRobertaDLModelReadOnnxModel
  • @@ -914,6 +954,22 @@

    Value Members

    Definition Classes
    ReadRobertaDLModel
    +
  • + + + + + + + + + def + + + readOnnxModel(path: String, spark: SparkSession, suffix: String, zipped: Boolean = true, useBundle: Boolean = false, sessionOptions: Option[SessionOptions] = None): OnnxWrapper + + +
    Definition Classes
    ReadOnnxModel
  • @@ -1110,6 +1166,8 @@

    Inherited from SerializableInherited from Serializable

    Inherited from ReadRobertaDLModel

    +
    +

    Inherited from ReadOnnxModel

    Inherited from ReadTensorflowModel

    diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings.html b/docs/api/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings.html index 9bacd7056ef9a2..06121192eda274 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.RoBertaEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.RoBertaEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • @@ -402,7 +426,7 @@

    class - RoBertaEmbeddings extends AnnotatorModel[RoBertaEmbeddings] with HasBatchedAnnotate[RoBertaEmbeddings] with WriteTensorflowModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties with HasEngine + RoBertaEmbeddings extends AnnotatorModel[RoBertaEmbeddings] with HasBatchedAnnotate[RoBertaEmbeddings] with WriteTensorflowModel with WriteOnnxModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties with HasEngine

    @@ -488,7 +512,7 @@

    Linear Supertypes - + @@ -514,7 +538,7 @@

    Inherited
      -
    1. RoBertaEmbeddings
    2. HasEngine
    3. HasCaseSensitiveProperties
    4. HasStorageRef
    5. HasEmbeddingsProperties
    6. HasProtectedParams
    7. WriteTensorflowModel
    8. HasBatchedAnnotate
    9. AnnotatorModel
    10. CanBeLazy
    11. RawAnnotator
    12. HasOutputAnnotationCol
    13. HasInputAnnotationCols
    14. HasOutputAnnotatorType
    15. ParamsAndFeaturesWritable
    16. HasFeatures
    17. DefaultParamsWritable
    18. MLWritable
    19. Model
    20. Transformer
    21. PipelineStage
    22. Logging
    23. Params
    24. Serializable
    25. Serializable
    26. Identifiable
    27. AnyRef
    28. Any
    29. +
    30. RoBertaEmbeddings
    31. HasEngine
    32. HasCaseSensitiveProperties
    33. HasStorageRef
    34. HasEmbeddingsProperties
    35. HasProtectedParams
    36. WriteOnnxModel
    37. WriteTensorflowModel
    38. HasBatchedAnnotate
    39. AnnotatorModel
    40. CanBeLazy
    41. RawAnnotator
    42. HasOutputAnnotationCol
    43. HasInputAnnotationCols
    44. HasOutputAnnotatorType
    45. ParamsAndFeaturesWritable
    46. HasFeatures
    47. DefaultParamsWritable
    48. MLWritable
    49. Model
    50. Transformer
    51. PipelineStage
    52. Logging
    53. Params
    54. Serializable
    55. Serializable
    56. Identifiable
    57. AnyRef
    58. Any
    @@ -2644,9 +2668,9 @@

    Value Members

  • - + - + @@ -2655,7 +2679,7 @@

    Value Members

    def - setModelIfNotSet(spark: SparkSession, tensorflowWrapper: TensorflowWrapper): RoBertaEmbeddings + setModelIfNotSet(spark: SparkSession, tensorflowWrapper: Option[TensorflowWrapper], onnxWrapper: Option[OnnxWrapper]): RoBertaEmbeddings

    @@ -3128,6 +3152,22 @@

    Value Members

    Definition Classes
    ParamsAndFeaturesWritable → DefaultParamsWritable → MLWritable
    +
  • + + + + + + + + + def + + + writeOnnxModel(path: String, spark: SparkSession, onnxWrapper: OnnxWrapper, suffix: String, fileName: String): Unit + + +
    Definition Classes
    WriteOnnxModel
  • @@ -3196,6 +3236,8 @@

    Inherited from Inherited from HasEmbeddingsProperties

    Inherited from HasProtectedParams

    +
    +

    Inherited from WriteOnnxModel

    Inherited from WriteTensorflowModel

    diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings$.html index e864b3a19032fc..9e5eee75969a70 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.RoBertaSentenceEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.RoBertaSentenceEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings.html b/docs/api/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings.html index 739b897cc318d4..55eb73e71fd389 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.RoBertaSentenceEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.RoBertaSentenceEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • @@ -2642,9 +2666,9 @@

    Value Members

  • - + - + @@ -2653,7 +2677,7 @@

    Value Members

    def - setModelIfNotSet(spark: SparkSession, tensorflowWrapper: TensorflowWrapper): RoBertaSentenceEmbeddings + setModelIfNotSet(spark: SparkSession, tensorflowWrapper: Option[TensorflowWrapper], onnxWrapper: Option[OnnxWrapper]): RoBertaSentenceEmbeddings

    diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/SentenceEmbeddings$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/SentenceEmbeddings$.html index 1d7dcf251ecfa1..7cd4f3da2bcc5e 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/SentenceEmbeddings$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/SentenceEmbeddings$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.SentenceEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.SentenceEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/SentenceEmbeddings.html b/docs/api/com/johnsnowlabs/nlp/embeddings/SentenceEmbeddings.html index 2992b4339b42cc..f8140137a4823f 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/SentenceEmbeddings.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/SentenceEmbeddings.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.SentenceEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.SentenceEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/UniversalSentenceEncoder$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/UniversalSentenceEncoder$.html index f824860dde9e2b..dce58dddb194d0 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/UniversalSentenceEncoder$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/UniversalSentenceEncoder$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.UniversalSentenceEncoder - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.UniversalSentenceEncoder + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/UniversalSentenceEncoder.html b/docs/api/com/johnsnowlabs/nlp/embeddings/UniversalSentenceEncoder.html index ff754894cbcb40..174bd0768e9b6b 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/UniversalSentenceEncoder.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/UniversalSentenceEncoder.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.UniversalSentenceEncoder - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.UniversalSentenceEncoder + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/Word2VecApproach$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/Word2VecApproach$.html index 2c23aba657c56f..8a57331ce40bec 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/Word2VecApproach$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/Word2VecApproach$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.Word2VecApproach - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.Word2VecApproach + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/Word2VecApproach.html b/docs/api/com/johnsnowlabs/nlp/embeddings/Word2VecApproach.html index c7d1554a777074..0e8531e45c85e8 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/Word2VecApproach.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/Word2VecApproach.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.Word2VecApproach - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.Word2VecApproach + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/Word2VecModel$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/Word2VecModel$.html index e5286354da57ad..15e0a044be1beb 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/Word2VecModel$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/Word2VecModel$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.Word2VecModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.Word2VecModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/Word2VecModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/Word2VecModel.html index 789b0febff4afa..2086c3995ad80f 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/Word2VecModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/Word2VecModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.Word2VecModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.Word2VecModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddings$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddings$.html index 56de42e45b105e..aca1a5f16467b1 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddings$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddings$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.WordEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.WordEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddings.html b/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddings.html index d8e4ada7a16d46..7e49327ebb7118 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddings.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddings.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.WordEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.WordEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsBinaryIndexer$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsBinaryIndexer$.html index 94eba4792dc13b..1e4f4758d8df83 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsBinaryIndexer$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsBinaryIndexer$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.WordEmbeddingsBinaryIndexer - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.WordEmbeddingsBinaryIndexer + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsModel$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsModel$.html index 75e2068b5650f9..8171b892492bd0 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsModel$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsModel$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsModel.html b/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsModel.html index 7466243e6d65a7..2b251c5b4f339e 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsModel.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsModel.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsReader.html b/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsReader.html index c20b8ce71e3818..3ce93b323ea05a 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsReader.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsReader.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.WordEmbeddingsReader - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.WordEmbeddingsReader + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsTextIndexer$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsTextIndexer$.html index 471e1e9e5c7590..cf8d0b12f8ed8a 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsTextIndexer$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsTextIndexer$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.WordEmbeddingsTextIndexer - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.WordEmbeddingsTextIndexer + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsWriter.html b/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsWriter.html index c51bdc9c7abacc..c940ea5cdf2a3f 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsWriter.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/WordEmbeddingsWriter.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.WordEmbeddingsWriter - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.WordEmbeddingsWriter + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/XlmRoBertaEmbeddings$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/XlmRoBertaEmbeddings$.html index ecf85689000d09..cd7c8b7efb2a1a 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/XlmRoBertaEmbeddings$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/XlmRoBertaEmbeddings$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.XlmRoBertaEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.XlmRoBertaEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/XlmRoBertaEmbeddings.html b/docs/api/com/johnsnowlabs/nlp/embeddings/XlmRoBertaEmbeddings.html index 05cd0a3ea4b4d7..26b476b0af3eb8 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/XlmRoBertaEmbeddings.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/XlmRoBertaEmbeddings.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.XlmRoBertaEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.XlmRoBertaEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/XlmRoBertaSentenceEmbeddings$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/XlmRoBertaSentenceEmbeddings$.html index ca80a0d3eed92d..f33afb2f46caac 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/XlmRoBertaSentenceEmbeddings$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/XlmRoBertaSentenceEmbeddings$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.XlmRoBertaSentenceEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.XlmRoBertaSentenceEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/XlmRoBertaSentenceEmbeddings.html b/docs/api/com/johnsnowlabs/nlp/embeddings/XlmRoBertaSentenceEmbeddings.html index a5ee7dd6601782..4ba5c1e48a1ed8 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/XlmRoBertaSentenceEmbeddings.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/XlmRoBertaSentenceEmbeddings.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.XlmRoBertaSentenceEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.XlmRoBertaSentenceEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/XlnetEmbeddings$.html b/docs/api/com/johnsnowlabs/nlp/embeddings/XlnetEmbeddings$.html index 552d5b64240daa..c24df814a518fd 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/XlnetEmbeddings$.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/XlnetEmbeddings$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.XlnetEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.XlnetEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/XlnetEmbeddings.html b/docs/api/com/johnsnowlabs/nlp/embeddings/XlnetEmbeddings.html index 664f7638d1dd70..5bd582e5a8b748 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/XlnetEmbeddings.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/XlnetEmbeddings.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings.XlnetEmbeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings.XlnetEmbeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -181,6 +185,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -213,10 +221,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -273,10 +289,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • diff --git a/docs/api/com/johnsnowlabs/nlp/embeddings/index.html b/docs/api/com/johnsnowlabs/nlp/embeddings/index.html index a4448a8843121e..a221cd1d2a4e90 100644 --- a/docs/api/com/johnsnowlabs/nlp/embeddings/index.html +++ b/docs/api/com/johnsnowlabs/nlp/embeddings/index.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.embeddings - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.embeddings + + @@ -28,7 +28,7 @@
  • + + + E5Embeddings
  • @@ -197,6 +201,10 @@

    Packages

    HasEmbeddingsProperties +
  • + + + InstructorEmbeddings
  • @@ -229,10 +237,18 @@

    Packages

    ReadDistilBertDLModel +
  • + + + ReadE5DLModel
  • ReadElmoDLModel +
  • + + + ReadInstructorDLModel
  • @@ -289,10 +305,18 @@

    Packages

    ReadablePretrainedDoc2Vec +
  • + + + ReadablePretrainedE5Model
  • ReadablePretrainedElmoModel +
  • + + + ReadablePretrainedInstructorModel
  • @@ -393,7 +417,23 @@

    Packages

    XlnetEmbeddings -
  • +
  • + + + + + + + + + package + + + finisher + + +
    Definition Classes
    nlp
    +
  • @@ -627,7 +667,7 @@

    Type Members

    Annotators Main Page for a list of transformer based embeddings

  • - + @@ -638,7 +678,7 @@

    Type Members

    class
    - BertEmbeddings extends AnnotatorModel[BertEmbeddings] with HasBatchedAnnotate[BertEmbeddings] with WriteTensorflowModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties with HasEngine + BertEmbeddings extends AnnotatorModel[BertEmbeddings] with HasBatchedAnnotate[BertEmbeddings] with WriteTensorflowModel with WriteOnnxModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties with HasEngine

    Token-level embeddings using BERT.

    Token-level embeddings using BERT. BERT (Bidirectional Encoder Representations from @@ -712,7 +752,7 @@

    Type Members

    Annotators Main Page for a list of transformer based embeddings

  • - + @@ -723,7 +763,7 @@

    Type Members

    class
    - BertSentenceEmbeddings extends AnnotatorModel[BertSentenceEmbeddings] with HasBatchedAnnotate[BertSentenceEmbeddings] with WriteTensorflowModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties with HasEngine with HasProtectedParams + BertSentenceEmbeddings extends AnnotatorModel[BertSentenceEmbeddings] with HasBatchedAnnotate[BertSentenceEmbeddings] with WriteTensorflowModel with WriteOnnxModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties with HasEngine with HasProtectedParams

    Sentence-level embeddings using BERT.

    Sentence-level embeddings using BERT. BERT (Bidirectional Encoder Representations from @@ -787,7 +827,7 @@

    Type Members

    |[-0.6180210709571838,-0.12179657071828842,-0.191165953874588,-1.4497021436691...| |[-0.822715163230896,0.7568016648292542,-0.1165061742067337,-1.59048593044281,...| +--------------------------------------------------------------------------------+
    See also

    - BertEmbeddings for token-level embeddings

    + BertSentenceEmbeddings for sentence-level embeddings

    BertForSequenceClassification for embeddings with a sequence classification layer on top

    Annotators Main Page for a list of transformer @@ -958,7 +998,7 @@

    Type Members

    |word_embeddings|sentence .|[0.139705, 0.177955, 0.1887775, -0.45545, 0.20030999, 0.461557, -0.07891501, ...| +---------------+----------+--------------------------------------------------------------------------------+
  • - + @@ -969,7 +1009,7 @@

    Type Members

    class
    - DeBertaEmbeddings extends AnnotatorModel[DeBertaEmbeddings] with HasBatchedAnnotate[DeBertaEmbeddings] with WriteTensorflowModel with WriteSentencePieceModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties with HasEngine + DeBertaEmbeddings extends AnnotatorModel[DeBertaEmbeddings] with HasBatchedAnnotate[DeBertaEmbeddings] with WriteTensorflowModel with WriteOnnxModel with WriteSentencePieceModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties with HasEngine

    The DeBERTa model was proposed in @@ -1046,7 +1086,7 @@

    Type Members

    Annotators Main Page for a list of transformer based embeddings

  • - + @@ -1057,7 +1097,7 @@

    Type Members

    class
    - DistilBertEmbeddings extends AnnotatorModel[DistilBertEmbeddings] with HasBatchedAnnotate[DistilBertEmbeddings] with WriteTensorflowModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties with HasEngine + DistilBertEmbeddings extends AnnotatorModel[DistilBertEmbeddings] with HasBatchedAnnotate[DistilBertEmbeddings] with WriteTensorflowModel with WriteOnnxModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties with HasEngine

    DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT @@ -1255,6 +1295,78 @@

    Type Members

    +--------------------------------------------------------------------------------+ |[0.06222493574023247,0.011579325422644615,0.009919632226228714,0.109361454844...| +--------------------------------------------------------------------------------+ +
  • + + + + + + + + + class + + + E5Embeddings extends AnnotatorModel[E5Embeddings] with HasBatchedAnnotate[E5Embeddings] with WriteTensorflowModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties with HasEngine + + +

    Sentence embeddings using E5.

    Sentence embeddings using E5.

    E5, an instruction-finetuned text embedding model that can generate text embeddings tailored +to any task (e.g., classification, retrieval, clustering, text evaluation, etc.)

    Pretrained models can be loaded with pretrained of the companion object:

    val embeddings = E5Embeddings.pretrained()
    +  .setInputCols("document")
    +  .setOutputCol("e5_embeddings")

    The default model is "e5_small", if no name is provided.

    For available pretrained models please see the +Models Hub.

    For extended examples of usage, see +E5EmbeddingsTestSpec.

    Sources :

    Text Embeddings by Weakly-Supervised Contrastive Pre-training

    E5 Github Repository

    Paper abstract

    This paper presents E5, a family of state-of-the-art text embeddings that transfer well to a +wide range of tasks. The model is trained in a contrastive manner with weak supervision +signals from our curated large-scale text pair dataset (called CCPairs). E5 can be readily +used as a general-purpose embedding model for any tasks requiring a single-vector +representation of texts such as retrieval, clustering, and classification, achieving strong +performance in both zero-shot and fine-tuned settings. We conduct extensive evaluations on 56 +datasets from the BEIR and MTEB benchmarks. For zero-shot settings, E5 is the first model that +outperforms the strong BM25 baseline on the BEIR retrieval benchmark without using any labeled +data. When fine-tuned, E5 obtains the best results on the MTEB benchmark, beating existing +embedding models with 40× more parameters.

    Example

    import spark.implicits._
    +import com.johnsnowlabs.nlp.base.DocumentAssembler
    +import com.johnsnowlabs.nlp.annotators.Tokenizer
    +import com.johnsnowlabs.nlp.embeddings.E5Embeddings
    +import com.johnsnowlabs.nlp.EmbeddingsFinisher
    +import org.apache.spark.ml.Pipeline
    +
    +val documentAssembler = new DocumentAssembler()
    +  .setInputCol("text")
    +  .setOutputCol("document")
    +
    +val embeddings = E5Embeddings.pretrained("e5_small", "en")
    +  .setInputCols("document")
    +  .setOutputCol("e5_embeddings")
    +
    +val embeddingsFinisher = new EmbeddingsFinisher()
    +  .setInputCols("e5_embeddings")
    +  .setOutputCols("finished_embeddings")
    +  .setOutputAsVector(true)
    +
    +val pipeline = new Pipeline().setStages(Array(
    +  documentAssembler,
    +  embeddings,
    +  embeddingsFinisher
    +))
    +
    +val data = Seq("query: how much protein should a female eat",
    +"passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day." +
    +But, as you can see from this chart, you'll need to increase that if you're expecting or training for a" +
    +marathon. Check out the chart below to see how much protein you should be eating each day."
    +
    +).toDF("text")
    +val result = pipeline.fit(data).transform(data)
    +
    +result.selectExpr("explode(finished_embeddings) as result").show(1, 80)
    ++--------------------------------------------------------------------------------+
    +|                                                                          result|
    ++--------------------------------------------------------------------------------+
    +|[[8.0190285E-4, -0.005974853, -0.072875895, 0.007944068, 0.026059335, -0.0080...|
    +[[0.050514214, 0.010061974, -0.04340176, -0.020937217, 0.05170225, 0.01157857...|
    ++--------------------------------------------------------------------------------+
    See also

    + Annotators Main Page for a list of transformer + based embeddings

  • @@ -1371,6 +1483,79 @@

    Type Members

    +
  • + + + + + + + + + class + + + InstructorEmbeddings extends AnnotatorModel[InstructorEmbeddings] with HasBatchedAnnotate[InstructorEmbeddings] with WriteTensorflowModel with HasEmbeddingsProperties with HasStorageRef with WriteSentencePieceModel with HasCaseSensitiveProperties with HasEngine + + +

    Sentence embeddings using INSTRUCTOR.

    Sentence embeddings using INSTRUCTOR.

    Instructor👨‍🏫, an instruction-finetuned text embedding model that can generate text +embeddings tailored to any task (e.g., classification, retrieval, clustering, text evaluation, +etc.) and domains (e.g., science, finance, etc.) by simply providing the task instruction, +without any finetuning. Instructor👨‍ achieves sota on 70 diverse embedding tasks!

    Pretrained models can be loaded with pretrained of the companion object:

    val embeddings = InstructorEmbeddings.pretrained()
    +  .setInputCols("document")
    +  .setOutputCol("instructor_embeddings")

    The default model is "instructor_base", if no name is provided.

    For available pretrained models please see the +Models Hub.

    For extended examples of usage, see +InstructorEmbeddingsTestSpec.

    Sources :

    One Embedder, Any Task: Instruction-Finetuned Text Embeddings

    INSTRUCTOR Github Repository

    Paper abstract

    We introduce INSTRUCTOR, a new method for computing text embeddings given task instructions: +every text input is embedded together with instructions explaining the use case (e.g., task +and domain descriptions). Unlike encoders from prior work that are more specialized, +INSTRUCTOR is a single embedder that can generate text embeddings tailored to different +downstream tasks and domains, without any further training. We first annotate instructions for +330 diverse tasks and train INSTRUCTOR on this multitask mixture with a contrastive loss. We +evaluate INSTRUCTOR on 70 embedding evaluation tasks (66 of which are unseen during training), +ranging from classification and information retrieval to semantic textual similarity and text +generation evaluation. INSTRUCTOR, while having an order of magnitude fewer parameters than +the previous best model, achieves state-of-the-art performance, with an average improvement of +3.4% compared to the previous best results on the 70 diverse datasets. Our analysis suggests +that INSTRUCTOR is robust to changes in instructions, and that instruction finetuning +mitigates the challenge of training a single model on diverse datasets. Our model, code, and +data are available at this https URL. https://instructor-embedding.github.io/

    Example

    import spark.implicits._
    +import com.johnsnowlabs.nlp.base.DocumentAssembler
    +import com.johnsnowlabs.nlp.annotators.Tokenizer
    +import com.johnsnowlabs.nlp.embeddings.InstructorEmbeddings
    +import com.johnsnowlabs.nlp.EmbeddingsFinisher
    +import org.apache.spark.ml.Pipeline
    +
    +val documentAssembler = new DocumentAssembler()
    +  .setInputCol("text")
    +  .setOutputCol("document")
    +
    +val embeddings = InstructorEmbeddings.pretrained("instructor_base", "en")
    +  .setInputCols("document")
    +  .setInstruction("Represent the Medicine sentence for clustering: ")
    +  .setOutputCol("instructor_embeddings")
    +
    +val embeddingsFinisher = new EmbeddingsFinisher()
    +  .setInputCols("instructor_embeddings")
    +  .setOutputCols("finished_embeddings")
    +  .setOutputAsVector(true)
    +
    +val pipeline = new Pipeline().setStages(Array(
    +  documentAssembler,
    +  embeddings,
    +  embeddingsFinisher
    +))
    +
    +val data = Seq("Dynamical Scalar Degree of Freedom in Horava-Lifshitz Gravity").toDF("text")
    +val result = pipeline.fit(data).transform(data)
    +
    +result.selectExpr("explode(finished_embeddings) as result").show(1, 80)
    ++--------------------------------------------------------------------------------+
    +|                                                                          result|
    ++--------------------------------------------------------------------------------+
    +|[-2.3497989177703857,0.480538547039032,-0.3238905668258667,-1.612930893898010...|
    ++--------------------------------------------------------------------------------+
    See also

    + Annotators Main Page for a list of transformer + based embeddings

  • @@ -1475,7 +1660,7 @@

    Type Members

  • - + @@ -1486,12 +1671,12 @@

    Type Members

    trait
    - ReadBertDLModel extends ReadTensorflowModel + ReadBertDLModel extends ReadTensorflowModel with ReadOnnxModel
  • - + @@ -1502,7 +1687,7 @@

    Type Members

    trait
    - ReadBertSentenceDLModel extends ReadTensorflowModel + ReadBertSentenceDLModel extends ReadTensorflowModel with ReadOnnxModel @@ -1523,7 +1708,7 @@

    Type Members

  • - + @@ -1534,12 +1719,12 @@

    Type Members

    trait
    - ReadDeBertaDLModel extends ReadTensorflowModel with ReadSentencePieceModel + ReadDeBertaDLModel extends ReadTensorflowModel with ReadSentencePieceModel with ReadOnnxModel
  • - + @@ -1550,7 +1735,23 @@

    Type Members

    trait
    - ReadDistilBertDLModel extends ReadTensorflowModel + ReadDistilBertDLModel extends ReadTensorflowModel with ReadOnnxModel + + + +
  • + + + + + + + + + trait + + + ReadE5DLModel extends ReadTensorflowModel @@ -1570,6 +1771,22 @@

    Type Members

    +
  • + + + + + + + + + trait + + + ReadInstructorDLModel extends ReadTensorflowModel with ReadSentencePieceModel + + +
  • @@ -1587,7 +1804,7 @@

    Type Members

  • - + @@ -1598,7 +1815,7 @@

    Type Members

    trait
    - ReadRobertaDLModel extends ReadTensorflowModel + ReadRobertaDLModel extends ReadTensorflowModel with ReadOnnxModel @@ -1794,6 +2011,22 @@

    Type Members

    +
  • + + + + + + + + + trait + + + ReadablePretrainedE5Model extends ParamsAndFeaturesReadable[E5Embeddings] with HasPretrained[E5Embeddings] + + +
  • @@ -1810,6 +2043,22 @@

    Type Members

    +
  • + + + + + + + + + trait + + + ReadablePretrainedInstructorModel extends ParamsAndFeaturesReadable[InstructorEmbeddings] with HasPretrained[InstructorEmbeddings] + + +
  • @@ -1971,7 +2220,7 @@

    Type Members

  • - + @@ -1982,7 +2231,7 @@

    Type Members

    class
    - RoBertaEmbeddings extends AnnotatorModel[RoBertaEmbeddings] with HasBatchedAnnotate[RoBertaEmbeddings] with WriteTensorflowModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties with HasEngine + RoBertaEmbeddings extends AnnotatorModel[RoBertaEmbeddings] with HasBatchedAnnotate[RoBertaEmbeddings] with WriteTensorflowModel with WriteOnnxModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties with HasEngine

    The RoBERTa model was proposed in @@ -3076,6 +3325,24 @@

    Value Members

    This is the companion object of Doc2VecModel.

    This is the companion object of Doc2VecModel. Please refer to that class for the documentation. +

    +
  • + + + + + + + + + object + + + E5Embeddings extends ReadablePretrainedE5Model with ReadE5DLModel with Serializable + + +

    This is the companion object of E5Embeddings.

    This is the companion object of E5Embeddings. Please refer to that class for the +documentation.

  • @@ -3094,6 +3361,24 @@

    Value Members

    This is the companion object of ElmoEmbeddings.

    This is the companion object of ElmoEmbeddings. Please refer to that class for the documentation. +

    +
  • + + + + + + + + + object + + + InstructorEmbeddings extends ReadablePretrainedInstructorModel with ReadInstructorDLModel with ReadSentencePieceModel with Serializable + + +

    This is the companion object of InstructorEmbeddings.

    This is the companion object of InstructorEmbeddings. Please refer to that class for the +documentation.

  • diff --git a/docs/api/com/johnsnowlabs/nlp/finisher/DocumentSimilarityRankerFinisher$.html b/docs/api/com/johnsnowlabs/nlp/finisher/DocumentSimilarityRankerFinisher$.html new file mode 100644 index 00000000000000..4ab6d5b5ba7d65 --- /dev/null +++ b/docs/api/com/johnsnowlabs/nlp/finisher/DocumentSimilarityRankerFinisher$.html @@ -0,0 +1,658 @@ + + + + + + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.finisher.DocumentSimilarityRankerFinisher + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    Packages

    + +
    +
    +
    + + + +

    + + + object + + + DocumentSimilarityRankerFinisher extends DefaultParamsReadable[DocumentSimilarityRankerFinisher] with Serializable + +

    + + +
    + + Linear Supertypes + +
    Serializable, Serializable, DefaultParamsReadable[DocumentSimilarityRankerFinisher], MLReadable[DocumentSimilarityRankerFinisher], AnyRef, Any
    +
    + + +
    +
    +
    + + + + + +
    +
    +
    + Ordering +
      + +
    1. Alphabetic
    2. +
    3. By Inheritance
    4. +
    +
    +
    + Inherited
    +
    +
      +
    1. DocumentSimilarityRankerFinisher
    2. Serializable
    3. Serializable
    4. DefaultParamsReadable
    5. MLReadable
    6. AnyRef
    7. Any
    8. +
    +
    + +
      +
    1. Hide All
    2. +
    3. Show All
    4. +
    +
    +
    + Visibility +
    1. Public
    2. All
    +
    +
    +
    + +
    +
    + + + + + + +
    +

    Value Members

    +
      +
    1. + + + + + + + + final + def + + + !=(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    2. + + + + + + + + final + def + + + ##(): Int + + +
      Definition Classes
      AnyRef → Any
      +
    3. + + + + + + + + final + def + + + ==(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    4. + + + + + + + + final + def + + + asInstanceOf[T0]: T0 + + +
      Definition Classes
      Any
      +
    5. + + + + + + + + + def + + + clone(): AnyRef + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    6. + + + + + + + + final + def + + + eq(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    7. + + + + + + + + + def + + + equals(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    8. + + + + + + + + + def + + + finalize(): Unit + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + classOf[java.lang.Throwable] + ) + +
      +
    9. + + + + + + + + final + def + + + getClass(): Class[_] + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    10. + + + + + + + + + def + + + hashCode(): Int + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    11. + + + + + + + + final + def + + + isInstanceOf[T0]: Boolean + + +
      Definition Classes
      Any
      +
    12. + + + + + + + + + def + + + load(path: String): DocumentSimilarityRankerFinisher + + +
      Definition Classes
      MLReadable
      Annotations
      + @Since( + + "1.6.0" + ) + +
      +
    13. + + + + + + + + final + def + + + ne(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    14. + + + + + + + + final + def + + + notify(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    15. + + + + + + + + final + def + + + notifyAll(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    16. + + + + + + + + + def + + + read: MLReader[DocumentSimilarityRankerFinisher] + + +
      Definition Classes
      DefaultParamsReadable → MLReadable
      +
    17. + + + + + + + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + + +
      Definition Classes
      AnyRef
      +
    18. + + + + + + + + + def + + + toString(): String + + +
      Definition Classes
      AnyRef → Any
      +
    19. + + + + + + + + final + def + + + wait(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    20. + + + + + + + + final + def + + + wait(arg0: Long, arg1: Int): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    21. + + + + + + + + final + def + + + wait(arg0: Long): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    22. +
    +
    + + + + +
    + +
    +
    +

    Inherited from Serializable

    +
    +

    Inherited from Serializable

    +
    +

    Inherited from DefaultParamsReadable[DocumentSimilarityRankerFinisher]

    +
    +

    Inherited from MLReadable[DocumentSimilarityRankerFinisher]

    +
    +

    Inherited from AnyRef

    +
    +

    Inherited from Any

    +
    + +
    + +
    +
    +

    Ungrouped

    + +
    +
    + +
    + +
    + + + +
    +
    +
    + + diff --git a/docs/api/com/johnsnowlabs/nlp/finisher/DocumentSimilarityRankerFinisher.html b/docs/api/com/johnsnowlabs/nlp/finisher/DocumentSimilarityRankerFinisher.html new file mode 100644 index 00000000000000..7e1e9d66634351 --- /dev/null +++ b/docs/api/com/johnsnowlabs/nlp/finisher/DocumentSimilarityRankerFinisher.html @@ -0,0 +1,1657 @@ + + + + + + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.finisher.DocumentSimilarityRankerFinisher + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    Packages

    + +
    +
    +
    + + + +

    + + + case class + + + DocumentSimilarityRankerFinisher(uid: String) extends Transformer with DefaultParamsWritable with Product with Serializable + +

    + + +
    + + Linear Supertypes + +
    Product, Equals, DefaultParamsWritable, MLWritable, Transformer, PipelineStage, Logging, Params, Serializable, Serializable, Identifiable, AnyRef, Any
    +
    + + +
    +
    +
    + + + + + +
    +
    +
    + Ordering +
      +
    1. Grouped
    2. +
    3. Alphabetic
    4. +
    5. By Inheritance
    6. +
    +
    +
    + Inherited
    +
    +
      +
    1. DocumentSimilarityRankerFinisher
    2. Product
    3. Equals
    4. DefaultParamsWritable
    5. MLWritable
    6. Transformer
    7. PipelineStage
    8. Logging
    9. Params
    10. Serializable
    11. Serializable
    12. Identifiable
    13. AnyRef
    14. Any
    15. +
    +
    + +
      +
    1. Hide All
    2. +
    3. Show All
    4. +
    +
    +
    + Visibility +
    1. Public
    2. All
    +
    +
    +
    + +
    +
    +
    +

    Instance Constructors

    +
    1. + + + + + + + + + new + + + DocumentSimilarityRankerFinisher() + + + +
    2. + + + + + + + + + new + + + DocumentSimilarityRankerFinisher(uid: String) + + + +
    +
    + + + + + +
    +

    Value Members

    +
      +
    1. + + + + + + + + final + def + + + !=(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    2. + + + + + + + + final + def + + + ##(): Int + + +
      Definition Classes
      AnyRef → Any
      +
    3. + + + + + + + + final + def + + + $[T](param: Param[T]): T + + +
      Attributes
      protected
      Definition Classes
      Params
      +
    4. + + + + + + + + final + def + + + ==(arg0: Any): Boolean + + +
      Definition Classes
      AnyRef → Any
      +
    5. + + + + + + + + + val + + + FINISHED_DOC_SIM_RANKER_ID_DEFAULT: String + + + +
    6. + + + + + + + + + val + + + FINISHED_DOC_SIM_RANKER_NEIGHBORS_DEFAULT: String + + + +
    7. + + + + + + + + + val + + + LSH_ID_COL_NAME: String + + + +
    8. + + + + + + + + + val + + + LSH_NEIGHBORS_COL_NAME: String + + + +
    9. + + + + + + + + final + def + + + asInstanceOf[T0]: T0 + + +
      Definition Classes
      Any
      +
    10. + + + + + + + + final + def + + + clear(param: Param[_]): DocumentSimilarityRankerFinisher.this.type + + +
      Definition Classes
      Params
      +
    11. + + + + + + + + + def + + + clone(): AnyRef + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    12. + + + + + + + + + def + + + copy(extra: ParamMap): Transformer + + +
      Definition Classes
      DocumentSimilarityRankerFinisher → Transformer → PipelineStage → Params
      +
    13. + + + + + + + + + def + + + copyValues[T <: Params](to: T, extra: ParamMap): T + + +
      Attributes
      protected
      Definition Classes
      Params
      +
    14. + + + + + + + + final + def + + + defaultCopy[T <: Params](extra: ParamMap): T + + +
      Attributes
      protected
      Definition Classes
      Params
      +
    15. + + + + + + + + final + def + + + eq(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    16. + + + + + + + + + def + + + explainParam(param: Param[_]): String + + +
      Definition Classes
      Params
      +
    17. + + + + + + + + + def + + + explainParams(): String + + +
      Definition Classes
      Params
      +
    18. + + + + + + + + + val + + + extractNearestNeighbor: BooleanParam + + + +
    19. + + + + + + + + final + def + + + extractParamMap(): ParamMap + + +
      Definition Classes
      Params
      +
    20. + + + + + + + + final + def + + + extractParamMap(extra: ParamMap): ParamMap + + +
      Definition Classes
      Params
      +
    21. + + + + + + + + + def + + + finalize(): Unit + + +
      Attributes
      protected[lang]
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + classOf[java.lang.Throwable] + ) + +
      +
    22. + + + + + + + + final + def + + + get[T](param: Param[T]): Option[T] + + +
      Definition Classes
      Params
      +
    23. + + + + + + + + final + def + + + getClass(): Class[_] + + +
      Definition Classes
      AnyRef → Any
      Annotations
      + @native() + +
      +
    24. + + + + + + + + final + def + + + getDefault[T](param: Param[T]): Option[T] + + +
      Definition Classes
      Params
      +
    25. + + + + + + + + + def + + + getExtractNearestNeighbor: Boolean + + +

      Name of input annotation cols containing embeddings +

      +
    26. + + + + + + + + + def + + + getInputCols: Array[String] + + +

      Name of DocumentSimilarityRankerFinisher output cols +

      +
    27. + + + + + + + + final + def + + + getOrDefault[T](param: Param[T]): T + + +
      Definition Classes
      Params
      +
    28. + + + + + + + + + def + + + getOutputCols: Array[String] + + +

      Name of input annotation cols containing embeddings +

      +
    29. + + + + + + + + + def + + + getParam(paramName: String): Param[Any] + + +
      Definition Classes
      Params
      +
    30. + + + + + + + + final + def + + + hasDefault[T](param: Param[T]): Boolean + + +
      Definition Classes
      Params
      +
    31. + + + + + + + + + def + + + hasParam(paramName: String): Boolean + + +
      Definition Classes
      Params
      +
    32. + + + + + + + + + def + + + initializeLogIfNecessary(isInterpreter: Boolean, silent: Boolean): Boolean + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    33. + + + + + + + + + def + + + initializeLogIfNecessary(isInterpreter: Boolean): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    34. + + + + + + + + + val + + + inputCols: StringArrayParam + + +

      Name of input annotation cols containing embeddings +

      +
    35. + + + + + + + + final + def + + + isDefined(param: Param[_]): Boolean + + +
      Definition Classes
      Params
      +
    36. + + + + + + + + final + def + + + isInstanceOf[T0]: Boolean + + +
      Definition Classes
      Any
      +
    37. + + + + + + + + final + def + + + isSet(param: Param[_]): Boolean + + +
      Definition Classes
      Params
      +
    38. + + + + + + + + + def + + + isTraceEnabled(): Boolean + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    39. + + + + + + + + + def + + + log: Logger + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    40. + + + + + + + + + def + + + logDebug(msg: ⇒ String, throwable: Throwable): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    41. + + + + + + + + + def + + + logDebug(msg: ⇒ String): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    42. + + + + + + + + + def + + + logError(msg: ⇒ String, throwable: Throwable): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    43. + + + + + + + + + def + + + logError(msg: ⇒ String): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    44. + + + + + + + + + def + + + logInfo(msg: ⇒ String, throwable: Throwable): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    45. + + + + + + + + + def + + + logInfo(msg: ⇒ String): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    46. + + + + + + + + + def + + + logName: String + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    47. + + + + + + + + + def + + + logTrace(msg: ⇒ String, throwable: Throwable): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    48. + + + + + + + + + def + + + logTrace(msg: ⇒ String): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    49. + + + + + + + + + def + + + logWarning(msg: ⇒ String, throwable: Throwable): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    50. + + + + + + + + + def + + + logWarning(msg: ⇒ String): Unit + + +
      Attributes
      protected
      Definition Classes
      Logging
      +
    51. + + + + + + + + final + def + + + ne(arg0: AnyRef): Boolean + + +
      Definition Classes
      AnyRef
      +
    52. + + + + + + + + final + def + + + notify(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    53. + + + + + + + + final + def + + + notifyAll(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @native() + +
      +
    54. + + + + + + + + + val + + + outputCols: StringArrayParam + + +

      Name of DocumentSimilarityRankerFinisher output cols +

      +
    55. + + + + + + + + + lazy val + + + params: Array[Param[_]] + + +
      Definition Classes
      Params
      +
    56. + + + + + + + + + def + + + save(path: String): Unit + + +
      Definition Classes
      MLWritable
      Annotations
      + @Since( + + "1.6.0" + ) + + @throws( + + ... + ) + +
      +
    57. + + + + + + + + final + def + + + set(paramPair: ParamPair[_]): DocumentSimilarityRankerFinisher.this.type + + +
      Attributes
      protected
      Definition Classes
      Params
      +
    58. + + + + + + + + final + def + + + set(param: String, value: Any): DocumentSimilarityRankerFinisher.this.type + + +
      Attributes
      protected
      Definition Classes
      Params
      +
    59. + + + + + + + + final + def + + + set[T](param: Param[T], value: T): DocumentSimilarityRankerFinisher.this.type + + +
      Definition Classes
      Params
      +
    60. + + + + + + + + final + def + + + setDefault(paramPairs: ParamPair[_]*): DocumentSimilarityRankerFinisher.this.type + + +
      Attributes
      protected
      Definition Classes
      Params
      +
    61. + + + + + + + + final + def + + + setDefault[T](param: Param[T], value: T): DocumentSimilarityRankerFinisher.this.type + + +
      Attributes
      protected[org.apache.spark.ml]
      Definition Classes
      Params
      +
    62. + + + + + + + + + def + + + setExtractNearestNeighbor(value: Boolean): DocumentSimilarityRankerFinisher.this.type + + +

      Set flag to extract best neighbor with distance +

      +
    63. + + + + + + + + + def + + + setInputCols(value: String*): DocumentSimilarityRankerFinisher.this.type + + +

      Name of input annotation cols containing similar documents +

      +
    64. + + + + + + + + + def + + + setInputCols(value: Array[String]): DocumentSimilarityRankerFinisher.this.type + + +

      Name of input annotation cols containing similar documents +

      +
    65. + + + + + + + + + def + + + setOutputCols(value: String*): DocumentSimilarityRankerFinisher.this.type + + +

      Name of DocumentSimilarityRankerFinisher output cols +

      +
    66. + + + + + + + + + def + + + setOutputCols(value: Array[String]): DocumentSimilarityRankerFinisher.this.type + + +

      Name of DocumentSimilarityRankerFinisher output cols +

      +
    67. + + + + + + + + final + def + + + synchronized[T0](arg0: ⇒ T0): T0 + + +
      Definition Classes
      AnyRef
      +
    68. + + + + + + + + + def + + + toString(): String + + +
      Definition Classes
      Identifiable → AnyRef → Any
      +
    69. + + + + + + + + + def + + + transform(dataset: Dataset[_]): DataFrame + + +
      Definition Classes
      DocumentSimilarityRankerFinisher → Transformer
      +
    70. + + + + + + + + + def + + + transform(dataset: Dataset[_], paramMap: ParamMap): DataFrame + + +
      Definition Classes
      Transformer
      Annotations
      + @Since( + + "2.0.0" + ) + +
      +
    71. + + + + + + + + + def + + + transform(dataset: Dataset[_], firstParamPair: ParamPair[_], otherParamPairs: ParamPair[_]*): DataFrame + + +
      Definition Classes
      Transformer
      Annotations
      + @Since( + + "2.0.0" + ) + + @varargs() + +
      +
    72. + + + + + + + + + def + + + transformSchema(schema: StructType): StructType + + +
      Definition Classes
      DocumentSimilarityRankerFinisher → PipelineStage
      +
    73. + + + + + + + + + def + + + transformSchema(schema: StructType, logging: Boolean): StructType + + +
      Attributes
      protected
      Definition Classes
      PipelineStage
      Annotations
      + @DeveloperApi() + +
      +
    74. + + + + + + + + + val + + + uid: String + + +
      Definition Classes
      DocumentSimilarityRankerFinisher → Identifiable
      +
    75. + + + + + + + + final + def + + + wait(): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    76. + + + + + + + + final + def + + + wait(arg0: Long, arg1: Int): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + +
      +
    77. + + + + + + + + final + def + + + wait(arg0: Long): Unit + + +
      Definition Classes
      AnyRef
      Annotations
      + @throws( + + ... + ) + + @native() + +
      +
    78. + + + + + + + + + def + + + write: MLWriter + + +
      Definition Classes
      DefaultParamsWritable → MLWritable
      +
    79. +
    +
    + + + + +
    + +
    +
    +

    Inherited from Product

    +
    +

    Inherited from Equals

    +
    +

    Inherited from DefaultParamsWritable

    +
    +

    Inherited from MLWritable

    +
    +

    Inherited from Transformer

    +
    +

    Inherited from PipelineStage

    +
    +

    Inherited from Logging

    +
    +

    Inherited from Params

    +
    +

    Inherited from Serializable

    +
    +

    Inherited from Serializable

    +
    +

    Inherited from Identifiable

    +
    +

    Inherited from AnyRef

    +
    +

    Inherited from Any

    +
    + +
    + +
    +
    +

    getParam

    + +
    +

    param

    + +
    +

    setParam

    + +
    +

    Ungrouped

    + +
    +
    + +
    + +
    + + + +
    +
    +
    + + diff --git a/docs/api/com/johnsnowlabs/nlp/finisher/index.html b/docs/api/com/johnsnowlabs/nlp/finisher/index.html new file mode 100644 index 00000000000000..f41024a95e7dab --- /dev/null +++ b/docs/api/com/johnsnowlabs/nlp/finisher/index.html @@ -0,0 +1,386 @@ + + + + + + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.finisher + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    Packages

    + +
    +
    +
    + +
    +
    p
    +

    com.johnsnowlabs.nlp

    +

    finisher + + + +

    + +
    + +

    + + + package + + + finisher + +

    + + +
    + + +
    +
    +
    + + + + + +
    +
    +
    + Ordering +
      + +
    1. Alphabetic
    2. + +
    +
    + +
    + Visibility +
    1. Public
    2. All
    +
    +
    +
    + +
    +
    + + +
    +

    Type Members

    +
    1. + + + + + + + + + case class + + + DocumentSimilarityRankerFinisher(uid: String) extends Transformer with DefaultParamsWritable with Product with Serializable + + + +
    +
    + + + +
    +

    Value Members

    +
      +
    1. + + + + + + + + + object + + + DocumentSimilarityRankerFinisher extends DefaultParamsReadable[DocumentSimilarityRankerFinisher] with Serializable + + + +
    2. +
    +
    + + + + +
    + +
    + + +
    + +
    +
    +

    Ungrouped

    + +
    +
    + +
    + +
    + + + +
    +
    +
    + + diff --git a/docs/api/com/johnsnowlabs/nlp/functions$$EachAnnotations.html b/docs/api/com/johnsnowlabs/nlp/functions$$EachAnnotations.html index c713a95812d6af..a5a13097efc391 100644 --- a/docs/api/com/johnsnowlabs/nlp/functions$$EachAnnotations.html +++ b/docs/api/com/johnsnowlabs/nlp/functions$$EachAnnotations.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.functions.EachAnnotations - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.functions.EachAnnotations + + @@ -28,7 +28,7 @@
  • + + + + + + + + + package + + + finisher + +
    Definition Classes
    nlp
  • diff --git a/docs/api/com/johnsnowlabs/nlp/index.html b/docs/api/com/johnsnowlabs/nlp/index.html index 6214071e90af07..74f8b0c85a3104 100644 --- a/docs/api/com/johnsnowlabs/nlp/index.html +++ b/docs/api/com/johnsnowlabs/nlp/index.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp + + @@ -28,7 +28,7 @@
  • + + + + + + + + + package + + + finisher + + +
  • diff --git a/docs/api/com/johnsnowlabs/nlp/pretrained/PretrainedPipeline$.html b/docs/api/com/johnsnowlabs/nlp/pretrained/PretrainedPipeline$.html index 6bfd0dc52d7a8e..03d8d29c56c715 100644 --- a/docs/api/com/johnsnowlabs/nlp/pretrained/PretrainedPipeline$.html +++ b/docs/api/com/johnsnowlabs/nlp/pretrained/PretrainedPipeline$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.pretrained.PretrainedPipeline - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.pretrained.PretrainedPipeline + + @@ -28,7 +28,7 @@
  • + + + + + + + + + package + + + finisher + +
    Definition Classes
    nlp
  • diff --git a/docs/api/com/johnsnowlabs/nlp/recursive/index.html b/docs/api/com/johnsnowlabs/nlp/recursive/index.html index 7e293d2f747b5d..0db88bcc73c972 100644 --- a/docs/api/com/johnsnowlabs/nlp/recursive/index.html +++ b/docs/api/com/johnsnowlabs/nlp/recursive/index.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.recursive - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.recursive + + @@ -28,7 +28,7 @@
  • + + + + + + + + + package + + + finisher + +
    Definition Classes
    nlp
  • diff --git a/docs/api/com/johnsnowlabs/nlp/recursive/package$$Recursive.html b/docs/api/com/johnsnowlabs/nlp/recursive/package$$Recursive.html index b0cec67aa1eb3e..8af12f8ae85283 100644 --- a/docs/api/com/johnsnowlabs/nlp/recursive/package$$Recursive.html +++ b/docs/api/com/johnsnowlabs/nlp/recursive/package$$Recursive.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.recursive.Recursive - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.recursive.Recursive + + @@ -28,7 +28,7 @@
  • + + + + + + + + + package + + + finisher + +
    Definition Classes
    nlp
  • diff --git a/docs/api/com/johnsnowlabs/nlp/training/CoNLL.html b/docs/api/com/johnsnowlabs/nlp/training/CoNLL.html index 8724262b966b01..0d9f1983e10abe 100644 --- a/docs/api/com/johnsnowlabs/nlp/training/CoNLL.html +++ b/docs/api/com/johnsnowlabs/nlp/training/CoNLL.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.training.CoNLL - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.training.CoNLL + + @@ -28,7 +28,7 @@
  • + + + + + + + + + package + + + finisher + +
    Definition Classes
    nlp
  • diff --git a/docs/api/com/johnsnowlabs/nlp/util/FinisherUtil$.html b/docs/api/com/johnsnowlabs/nlp/util/FinisherUtil$.html index be96d6ea13c7f1..c3379fca71aab0 100644 --- a/docs/api/com/johnsnowlabs/nlp/util/FinisherUtil$.html +++ b/docs/api/com/johnsnowlabs/nlp/util/FinisherUtil$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.util.FinisherUtil - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.util.FinisherUtil + + @@ -28,7 +28,7 @@
  • + + + + + + + + + package + + + finisher + +
    Definition Classes
    nlp
  • diff --git a/docs/api/com/johnsnowlabs/nlp/util/io/ExternalResource$.html b/docs/api/com/johnsnowlabs/nlp/util/io/ExternalResource$.html index d14c1f057506c0..365aa4480ad6d0 100644 --- a/docs/api/com/johnsnowlabs/nlp/util/io/ExternalResource$.html +++ b/docs/api/com/johnsnowlabs/nlp/util/io/ExternalResource$.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.4 ScalaDoc - com.johnsnowlabs.nlp.util.io.ExternalResource - - + Spark NLP 5.0.0 ScalaDoc - com.johnsnowlabs.nlp.util.io.ExternalResource + + @@ -28,7 +28,7 @@
  • DistilBertEmbeddings (class in sparknlp.annotator.embeddings.distil_bert_embeddings)
  • - - +