From 2a3d64ab51b14fa9f0daeead7bda3cd9b9fd122b Mon Sep 17 00:00:00 2001
From: Saif Addin 
Date: Thu, 9 Aug 2018 19:46:46 -0300
Subject: [PATCH 1/2] Updated version numbers, ready to release

---
 CHANGELOG                                    |  43 ++++
 README.md                                    |  28 +--
 build.sbt                                    |   4 +-
 docs/index.html                              |   4 +-
 docs/notebooks.html                          |  18 +-
 docs/quickstart.html                         |  24 +--
 .../ModelDownloaderExample.ipynb             |  20 +-
 .../example/model-downloader/assertion.ipynb | 196 ------------------
 python/example/model-downloader/dl-ner.ipynb |   2 +-
 python/setup.py                              |   2 +-
 .../scala/com/johnsnowlabs/util/Build.scala  |   2 +-
 11 files changed, 104 insertions(+), 239 deletions(-)
 delete mode 100644 python/example/model-downloader/assertion.ipynb

diff --git a/CHANGELOG b/CHANGELOG
index 73bcbf94f75009..a915a790d72627 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,46 @@
+========
+1.6.1
+========
+
+---------------
+New features
+---------------
+* New Scala-only functions that make it easier to work with Annotations in DataFrames. They may be imported through com.johnsnowlabs.nlp.functions._ and allow mapping and filtering within and outside Annotations.
+filterByAnnotations, mapAnnotations and explodeAnnotations work by providing a column and a function. Check out the documentation. Possibly coming to Python later.
+
+---------------
+Bug fixes
+---------------
+* Fixed incorrect filesystem reads in some S3 environments for word embeddings
+* Fixed NerCRF not training correctly from CoNLL, labeling everything as -O- (Thanks @arnound from Slack Channel)
+
+---------------
+Enhancements
+---------------
+* Added overridable config sparknlp.settings.cluster_tmp_dir, which allows setting the cluster location for the temporary embeddings file. May help S3-based clusters with no fs.defaultFS set to a proper distributed storage.
+* New annotator type: CHUNK. Represents a SUBSTRING of DOCUMENT and is used as output from NerConverter, TextMatcher, RegexMatcher and other annotators that retrieve a substring from the original document.
+This will make for better modularity and integration across annotators, such as between NER and AssertionStatus.
+* New annotation transformer: ChunkAssembler. Takes a string or array(string) column from a dataset and creates a CHUNK type annotation. The content must also belong to the current DOCUMENT annotation's content.
+* SentenceDetector's new param explodeSentences allows exploding sentences within a single row into different rows, to increase parallelism and performance in some scenarios, particularly OCR-based ones.
+* AssertionDLApproach may now be used within LightPipelines
+* AssertionDLApproach and AssertionLogRegApproach now work from the CHUNK type instead of start/end bounds, though they may still be trained with start/end. This means the target for assertion may now be any CHUNK output annotator (e.g. RegexMatcher)
+
+---------------
+Other
+---------------
+* PerceptronApproachLegacy moved back to being the default PerceptronApproach. The distributed PerceptronApproach moved to PerceptronApproachDistributed due to not meeting accuracy expectations yet.
+* Some configuration parameters in application.conf have been appropriately moved to proper annotator Params (NorvigSweeting Spell Checker, Vivekn Approach and Sentiment Detector affected)
+* application.conf configuration values renamed for better consistency
+
+---------------
+Developer API
+---------------
+* Added beforeAnnotate() and afterAnnotate() to manipulate dataframes before or after calling the annotate() UDF
+* Added extraValidate() and extraValidateMsg() in all annotators to allow developers to add additional SCHEMA checks in the transformSchema() stage
+* Removed the validation() stage from the fit() stage. Allows for more flexible training when some of the columns are not really required yet.
+* WrapColumnMetadata() will wrap an Annotation column with its appropriate Metadata. Makes it easier not to forget about Metadata in the Schema.
+* The RawAnnotator trait now has all the basics needed to start a new Annotator without an annotate() function. It is the complete stage preceding AnnotatorModel, which inherits from RawAnnotator.
+
 ========
 1.6.0
 ========
diff --git a/README.md b/README.md
index a3bab208e6b35f..a59fad4a9da240 100644
--- a/README.md
+++ b/README.md
@@ -14,18 +14,18 @@ Questions? Feedback? Request access sending an email to nlp@johnsnowlabs.com

This library has been uploaded to the spark-packages repository https://spark-packages.org/package/JohnSnowLabs/spark-nlp .

-To use the most recent version just add the `--packages JohnSnowLabs:spark-nlp:1.6.0` to you spark command
+To use the most recent version just add the `--packages JohnSnowLabs:spark-nlp:1.6.1` to your spark command

```sh
-spark-shell --packages JohnSnowLabs:spark-nlp:1.6.0
+spark-shell --packages JohnSnowLabs:spark-nlp:1.6.1
```

```sh
-pyspark --packages JohnSnowLabs:spark-nlp:1.6.0
+pyspark --packages JohnSnowLabs:spark-nlp:1.6.1
```

```sh
-spark-submit --packages JohnSnowLabs:spark-nlp:1.6.0
+spark-submit --packages JohnSnowLabs:spark-nlp:1.6.1
```

## Jupyter Notebook
@@ -35,23 +35,23 @@
export SPARK_HOME=/path/to/your/spark/folder
export PYSPARK_DRIVER_PYTHON=jupyter
export PYSPARK_DRIVER_PYTHON_OPTS=notebook

-pyspark --packages JohnSnowLabs:spark-nlp:1.6.0
+pyspark --packages JohnSnowLabs:spark-nlp:1.6.1
```

## Apache Zeppelin
This way will work for both Scala and Python
```
-export SPARK_SUBMIT_OPTIONS="--packages JohnSnowLabs:spark-nlp:1.6.0"
+export SPARK_SUBMIT_OPTIONS="--packages JohnSnowLabs:spark-nlp:1.6.1"
```
Alternatively, add the following Maven Coordinates to the interpreter's library list
```
-com.johnsnowlabs.nlp:spark-nlp_2.11:1.6.0
+com.johnsnowlabs.nlp:spark-nlp_2.11:1.6.1
```

## Python without explicit Spark installation
If you installed pyspark through pip, you can now install sparknlp through pip
```
-pip install --index-url https://test.pypi.org/simple/ spark-nlp==1.6.0
+pip install --index-url https://test.pypi.org/simple/ spark-nlp==1.6.1
```
Then you'll have to create a SparkSession manually, for example:
```
spark = SparkSession.builder \

## Pre-compiled Spark-NLP and Spark-NLP-OCR

You may download fat-jar from here:
-[Spark-NLP 1.6.0 FAT-JAR](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/spark-nlp-assembly-1.6.0.jar)
+[Spark-NLP 1.6.1 FAT-JAR](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/spark-nlp-assembly-1.6.1.jar)
or non-fat from here
-[Spark-NLP 1.6.0 PKG JAR](http://repo1.maven.org/maven2/com/johnsnowlabs/nlp/spark-nlp_2.11/1.6.0/spark-nlp_2.11-1.6.0.jar)
+[Spark-NLP 1.6.1 PKG 
JAR](http://repo1.maven.org/maven2/com/johnsnowlabs/nlp/spark-nlp_2.11/1.6.1/spark-nlp_2.11-1.6.1.jar) Spark-NLP-OCR Module (Requires native Tesseract 4.x+ for image based OCR. Does not require Spark-NLP to work but highly suggested) -[Spark-NLP-OCR 1.6.0 FAT-JAR](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/spark-nlp-ocr-assembly-1.6.0.jar) +[Spark-NLP-OCR 1.6.1 FAT-JAR](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/spark-nlp-ocr-assembly-1.6.1.jar) ## Maven central @@ -83,19 +83,19 @@ Our package is deployed to maven central. In order to add this package as a depe com.johnsnowlabs.nlp spark-nlp_2.11 - 1.6.0 + 1.6.1 ``` #### SBT ```sbtshell -libraryDependencies += "com.johnsnowlabs.nlp" % "spark-nlp_2.11" % "1.6.0" +libraryDependencies += "com.johnsnowlabs.nlp" % "spark-nlp_2.11" % "1.6.1" ``` If you are using `scala 2.11` ```sbtshell -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "1.6.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "1.6.1" ``` ## Using the jar manually diff --git a/build.sbt b/build.sbt index 76800b6f0c816b..f81b9aa039c139 100644 --- a/build.sbt +++ b/build.sbt @@ -9,7 +9,7 @@ name := "spark-nlp" organization := "com.johnsnowlabs.nlp" -version := "1.6.0" +version := "1.6.1" scalaVersion in ThisBuild := scalaVer @@ -137,7 +137,7 @@ assemblyMergeStrategy in assembly := { lazy val ocr = (project in file("ocr")) .settings( name := "spark-nlp-ocr", - version := "1.6.0", + version := "1.6.1", libraryDependencies ++= ocrDependencies ++ analyticsDependencies ++ testDependencies, diff --git a/docs/index.html b/docs/index.html index 8938713c76c817..7075d9a7191c91 100644 --- a/docs/index.html +++ b/docs/index.html @@ -78,8 +78,8 @@

High Performance NLP with Apache Spark

Questions? Join our Slack

-

2018 Jul 7th - Update! 1.6.0 Released! OCR PDF to Spark-NLP capabilities, new Chunker annotator, fixed AWS compatibility, better performance and much more. - Learn changes HERE and check out for updated documentation below

+

2018 Aug 9th - Update! 1.6.1 Released! Fixed S3-based cluster support, new CHUNK annotation type and more!
+                                    Learn about the changes HERE and check out the updated documentation below

diff --git a/docs/notebooks.html b/docs/notebooks.html index 359e85b4494b30..384797b1a3bb52 100644 --- a/docs/notebooks.html +++ b/docs/notebooks.html @@ -103,7 +103,7 @@

Vivekn Sentiment Analysis: Since we are dealing with small amounts of data, we put LightPipelines into practice.

- Take me to notebook! + Take me to notebook!

@@ -135,7 +135,7 @@

Vivekn Sentiment Analysis

better Sentiment Analysis accuracy

- Take me to notebook! + Take me to notebook!

@@ -157,7 +157,7 @@

Rule-based Sentiment Analysis: Each of these sentences will be used for giving a score to the text

- Take me to notebook! + Take me to notebook!

@@ -177,7 +177,7 @@

CRF Named Entity Recognition

- Take me to notebook! + Take me to notebook!

@@ -196,7 +196,7 @@

CNN Deep Learning NER

and it will leverage batch-based distributed calls to native TensorFlow libraries during prediction.

- Take me to notebook! + Take me to notebook!

@@ -211,7 +211,7 @@

Simple Text Matching

This annotator is an AnnotatorModel and does not require training.

- Take me to notebook! + Take me to notebook!

@@ -226,7 +226,7 @@

Assertion Status with LogReg: the dataset will return the appropriate result.

- Take me to notebook! + Take me to notebook!

@@ -241,7 +241,7 @@

Deep Learning Assertion Status: graphs may be redesigned if needed.

- Take me to notebook! + Take me to notebook!

@@ -260,7 +260,7 @@

Retrieving Pretrained models: Such components may then be injected seamlessly into further pipelines, and so on.

- Take me to notebook! + Take me to notebook!

diff --git a/docs/quickstart.html b/docs/quickstart.html index 89f7c7224f010b..0f435259cb9b25 100644 --- a/docs/quickstart.html +++ b/docs/quickstart.html @@ -95,9 +95,9 @@

Requirements

To start using the library, execute any of the following lines depending on your desired use case:

-
spark-shell --packages JohnSnowLabs:spark-nlp:1.6.0
-pyspark --packages JohnSnowLabs:spark-nlp:1.6.0
-spark-submit --packages JohnSnowLabs:spark-nlp:1.6.0
+                                
spark-shell --packages JohnSnowLabs:spark-nlp:1.6.1
+pyspark --packages JohnSnowLabs:spark-nlp:1.6.1
+spark-submit --packages JohnSnowLabs:spark-nlp:1.6.1
 
NOTE: The Spark --packages option has been reported to work improperly, particularly in Python, when utilizing physical clusters. Using --jars is advised. For Python, add Spark-NLP through pip
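For example (the jar path is a hypothetical location; point it at the pre-compiled assembly jar described further down this page):

spark-shell --jars /path/to/spark-nlp-assembly-1.6.1.jar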
@@ -105,35 +105,35 @@

Requirements

Using a Databricks cloud cluster or an Apache Zeppelin Scala notebook? Add the following Maven coordinates in the appropriate menu:

-
com.johnsnowlabs.nlp:spark-nlp_2.11:1.6.0
+
com.johnsnowlabs.nlp:spark-nlp_2.11:1.6.1

For Python in Apache Zeppelin you may need to set up SPARK_SUBMIT_OPTIONS using the --packages instruction shown above, like this

-
export SPARK_SUBMIT_OPTIONS="--packages JohnSnowLabs:spark-nlp:1.6.0"
+
export SPARK_SUBMIT_OPTIONS="--packages JohnSnowLabs:spark-nlp:1.6.1"

Python Jupyter Notebook with PySpark? Add the following environment variables (depending on your OS)

export SPARK_HOME=/path/to/your/spark/folder
 export PYSPARK_DRIVER_PYTHON=jupyter
 export PYSPARK_DRIVER_PYTHON_OPTS=notebook
 
-pyspark --packages JohnSnowLabs:spark-nlp:1.6.0
+pyspark --packages JohnSnowLabs:spark-nlp:1.6.1

Python without an explicit Spark installation? Use pip to install (after you have pip-installed pyspark)

-
pip install --index-url https://test.pypi.org/simple/ spark-nlp==1.6.0
+
pip install --index-url https://test.pypi.org/simple/ spark-nlp==1.6.1

In this case, you will have to start the SparkSession in your Python program manually; here is an example

spark = SparkSession.builder \
     .appName("ner")\
     .master("local[*]")\
     .config("spark.driver.memory","4G")\
     .config("spark.driver.maxResultSize", "2G") \
-    .config("spark.driver.extraClassPath", "lib/spark-nlp-assembly-1.6.0.jar")\
+    .config("spark.driver.extraClassPath", "lib/spark-nlp-assembly-1.6.1.jar")\
     .config("spark.kryoserializer.buffer.max", "500m")\
     .getOrCreate()

Pre-compiled Spark-NLP assembly fat-jar for use in standalone projects may be downloaded
- here
+ here
A non-fat jar may be downloaded
- here
+ here
Then, run spark-shell or spark-submit with the appropriate --jars
- /path/to/spark-nlp_2.11-1.6.0.jar to use the library in spark.
+ /path/to/spark-nlp_2.11-1.6.1.jar to use the library in Spark.

For further alternatives and documentation, check out our README page on GitHub.
@@ -419,7 +419,7 @@

Utilizing Spark-NLP OCR PDF Converter

Installing Spark-NLP OCRHelper

First, either build from source or download the following standalone jar module (works from both Spark-NLP Python and Scala):
- Spark-NLP-OCR
+ Spark-NLP-OCR
And add it to your Spark environment (with --jars, or the spark.driver.extraClassPath and spark.executor.extraClassPath configuration)
Second, if your PDFs don't have a text layer (this depends on how the PDFs were created), the library will use Tesseract 4.0 in the background.
Tesseract will utilize native libraries, so you'll have to get them installed on your system.
diff --git a/python/example/model-downloader/ModelDownloaderExample.ipynb b/python/example/model-downloader/ModelDownloaderExample.ipynb
index 78159e5bf90d17..6d85e8974eb9da 100644
--- a/python/example/model-downloader/ModelDownloaderExample.ipynb
+++ b/python/example/model-downloader/ModelDownloaderExample.ipynb
@@ -209,6 +209,24 @@
     "ner_tagged.show()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's try a sentiment analysis pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sparknlp.pretrained.pipeline.en import SentimentPipeline\n",
+    "\n",
+    "SentimentPipeline.annotate(\"This is a good movie!!!\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -233,7 +251,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.5"
+   "version": "3.6.6"
   }
  },
  "nbformat": 4,
diff --git a/python/example/model-downloader/assertion.ipynb b/python/example/model-downloader/assertion.ipynb
deleted file mode 100644
index 30cd32e8a27587..00000000000000
--- a/python/example/model-downloader/assertion.ipynb
+++ /dev/null
@@ -1,196 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Show how to use pretrained assertion status"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import sys\n",
-    "sys.path.append('../../')\n",
-    "\n",
-    "from pyspark.sql import SparkSession\n",
-    "from pyspark.ml import PipelineModel\n",
-    "\n",
-    "from sparknlp.annotator import *\n",
-    "from sparknlp.common import *\n",
-    "from sparknlp.base import *\n",
-    "from sparknlp.pretrained import ResourceDownloader\n",
-    "\n",
-    "from pathlib import Path\n",
-    "\n",
-    "if sys.version_info[0] < 3:\n",
-    "    from urllib import urlretrieve\n",
-    "else:\n",
-    "    from urllib.request import urlretrieve"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "spark = SparkSession.builder \\\n",
-    "    .appName(\"assertion-status\")\\\n",
-    "    .master(\"local[1]\")\\\n",
-    "    .config(\"spark.driver.memory\",\"4G\")\\\n",
-    "    .config(\"spark.driver.maxResultSize\", \"2G\") \\\n",
-    "    .config(\"spark.driver.extraClassPath\", \"lib/sparknlp.jar\")\\\n",
-    "    .getOrCreate()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Create some data for testing purposes"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from pyspark.sql import Row\n",
-    "R = Row('sentence', 'start', 'end')\n",
-    "test_data = spark.createDataFrame([R('Sister with stomach cancer .',2,3),\n",
-    "                                   R('A thallium stress test showed tachycardia and severe dyspnea',5,5),\n",
-    "                                   R('Positive for shortness of breath, no cough',2,4),\n",
-    "                                   R('Positive for shortness of breath, no cough',7,7)])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Create some 
pipelines, one for each type of assertion classification algorithm, model download can take some time." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "\n", - "# instantiate the downloader\n", - "downloader = ResourceDownloader()\n", - "\n", - "documentAssembler = DocumentAssembler() \\\n", - " .setInputCol(\"sentence\") \\\n", - " .setOutputCol(\"document\")\n", - "\n", - "# download bidirectional lstm based assertion status trained on negex dataset\n", - "assertion_fast_dl = downloader.downloadModel(AssertionDLModel, \"as_fast_dl\", \"en\") \\\n", - " .setInputCols([\"document\"]) \\\n", - " .setOutputCol(\"assertion\") \\\n", - " \n", - "\n", - "finisher = Finisher() \\\n", - " .setInputCols([\"assertion\"]) \\\n", - " .setIncludeKeys(True)\n", - "\n", - "pipeline_fast_dl = PipelineModel(stages = [documentAssembler, assertion_fast_dl, finisher])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's use these pipelines and see the results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pipeline_fast_dl.transform(test_data).show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# download logistic regression based assertion status trained on negex dataset\n", - "assertion_fast_lg = downloader.downloadModel(AssertionLogRegModel, \"as_fast_lg\", \"en\") \\\n", - " .setInputCols([\"document\"]) \\\n", - " .setOutputCol(\"assertion\") \\\n", - "\n", - "pipeline_fast_lg = PipelineModel(stages = [documentAssembler, assertion_fast_lg, finisher])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pipeline_fast_lg.transform(test_data).show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# download bidirectional lstm based assertion status trained on i2b2 dataset\n", - "assertion_full_dl = downloader.downloadModel(AssertionDLModel, \"as_fast_dl\", \"en\") \\\n", - " .setInputCols([\"document\"])\\\n", - " .setOutputCol(\"assertion\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pipeline_full_dl = PipelineModel(stages = [documentAssembler, assertion_full_dl, finisher])\n", - "pipeline_full_dl.transform(test_data).show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/python/example/model-downloader/dl-ner.ipynb b/python/example/model-downloader/dl-ner.ipynb index 2289fa9035d364..26edd2a9ec3ef4 100644 --- a/python/example/model-downloader/dl-ner.ipynb +++ b/python/example/model-downloader/dl-ner.ipynb @@ -139,7 +139,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.6" } }, "nbformat": 4, diff --git a/python/setup.py b/python/setup.py index ced0f1fc3241b8..3ba288838de207 100644 --- 
a/python/setup.py
+++ b/python/setup.py
@@ -40,7 +40,7 @@
     # For a discussion on single-sourcing the version across setup.py and the
     # project code, see
     # https://packaging.python.org/en/latest/single_source_version.html
-    version='1.6.0',  # Required
+    version='1.6.1',  # Required

     # This is a one-line description or tagline of what your project does. This
     # corresponds to the "Summary" metadata field:
diff --git a/src/main/scala/com/johnsnowlabs/util/Build.scala b/src/main/scala/com/johnsnowlabs/util/Build.scala
index 508ca9bd556cd6..9acdbdefd5829d 100644
--- a/src/main/scala/com/johnsnowlabs/util/Build.scala
+++ b/src/main/scala/com/johnsnowlabs/util/Build.scala
@@ -11,6 +11,6 @@ object Build {
     if (version != null && version.nonEmpty)
       version
     else
-      "1.6.0"
+      "1.6.1"
   }
 }
\ No newline at end of file

From 4f6bd68cd265bcc08491c4eee2a16eed6e3ceeea Mon Sep 17 00:00:00 2001
From: Saif Addin 
Date: Thu, 9 Aug 2018 20:04:20 -0300
Subject: [PATCH 2/2] Updated Changelog and added back deleted pretrained models

---
 CHANGELOG                                              |  8 ++++++++
 python/sparknlp/annotator.py                           | 10 ++++++++++
 src/main/scala/com/johnsnowlabs/nlp/annotator.scala    |  7 ++++---
 .../nlp/annotators/assertion/dl/AssertionDLModel.scala |  8 +++++++-
 .../assertion/logreg/AssertionLogRegModel.scala        |  9 ++++++++-
 src/main/scala/com/johnsnowlabs/nlp/base.scala         |  3 +++
 6 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index a915a790d72627..ed3101a70806c6 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,14 @@
 ========
 1.6.1
 ========
+---------------
+Overview
+---------------
+Hi! We're glad to announce new hotfix 1.6.1. Although the changes seem modest or very specific, there is a lot going on under the hood. First of all, we've worked hard with the community to understand S3-based clusters,
+which don't have a common fs.defaultFS configuration, the setting we use to tell where the cluster temp folder is located in order to distribute word embeddings. We fixed two things here:
+first, a bug that pointed to the wrong filesystem; second, we added a custom override setting in application.conf that allows manually setting where temp folders go in the cluster. This should help S3 users.
+Please share your feedback in this regard.
+On the other hand, we created a new annotator type internally. The CHUNK type allows better modularity in the communication between different annotators. Its impact will be noticed implicitly and over time.
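+
+For illustration, assuming the standard Typesafe application.conf mechanism this setting lives in, a minimal override could look like:
+
+    sparknlp.settings.cluster_tmp_dir = "hdfs:///tmp/sparknlp_embeddings"
+
+where the path is a hypothetical placeholder for your cluster's distributed storage.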
--------------- New features diff --git a/python/sparknlp/annotator.py b/python/sparknlp/annotator.py index 386b6f5e796c91..bb7032f4b4e7eb 100755 --- a/python/sparknlp/annotator.py +++ b/python/sparknlp/annotator.py @@ -1059,6 +1059,11 @@ def __init__(self, java_model=None): else: super(AssertionLogRegModel, self).__init__(classname="com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegModel") + @staticmethod + def pretrained(name="as_fast_lg", language="en"): + from sparknlp.pretrained import ResourceDownloader + return ResourceDownloader.downloadModel(AssertionLogRegModel, name, language) + class NerDLApproach(AnnotatorApproach, ApproachWithEmbeddings, NerApproach): @@ -1190,3 +1195,8 @@ def __init__(self, java_model=None): super(JavaModel, self).__init__(java_model) else: super(AssertionDLModel, self).__init__(classname="com.johnsnowlabs.nlp.annotators.assertion.dl.AssertionDLModel") + + @staticmethod + def pretrained(name="as_fast_dl", language="en"): + from sparknlp.pretrained import ResourceDownloader + return ResourceDownloader.downloadModel(AssertionDLModel, name, language) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala index dc8b3754aa6d83..b1d36168734b74 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala @@ -1,7 +1,8 @@ package com.johnsnowlabs.nlp import com.johnsnowlabs.nlp.annotators.PretrainedLemmatizer -import com.johnsnowlabs.nlp.annotators.assertion.dl.ReadsAssertionGraph +import com.johnsnowlabs.nlp.annotators.assertion.dl.{PretrainedDLAssertionStatus, ReadsAssertionGraph} +import com.johnsnowlabs.nlp.annotators.assertion.logreg.PretrainedAssertionLogRegModel import com.johnsnowlabs.nlp.annotators.ner.crf.PretrainedNerCrf import com.johnsnowlabs.nlp.annotators.ner.dl.{PretrainedNerDL, ReadsNERGraph, WithGraphResolver} import com.johnsnowlabs.nlp.annotators.pos.perceptron.PretrainedPerceptronModel @@ -47,7 +48,7 @@ object annotator { type AssertionLogRegApproach = com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegApproach object AssertionLogRegApproach extends DefaultParamsReadable[AssertionLogRegApproach] type AssertionLogRegModel = com.johnsnowlabs.nlp.annotators.assertion.logreg.AssertionLogRegModel - object AssertionLogRegModel extends EmbeddingsReadable[AssertionLogRegModel] + object AssertionLogRegModel extends EmbeddingsReadable[AssertionLogRegModel] with PretrainedAssertionLogRegModel type NerCrfApproach = com.johnsnowlabs.nlp.annotators.ner.crf.NerCrfApproach object NerCrfApproach extends DefaultParamsReadable[NerCrfApproach] @@ -95,6 +96,6 @@ object annotator { type AssertionDLApproach = com.johnsnowlabs.nlp.annotators.assertion.dl.AssertionDLApproach object AssertionDLApproach extends DefaultParamsReadable[AssertionDLApproach] type AssertionDLModel = com.johnsnowlabs.nlp.annotators.assertion.dl.AssertionDLModel - object AssertionDLModel extends EmbeddingsReadable[AssertionDLModel] with ReadsAssertionGraph + object AssertionDLModel extends EmbeddingsReadable[AssertionDLModel] with ReadsAssertionGraph with PretrainedDLAssertionStatus } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/dl/AssertionDLModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/dl/AssertionDLModel.scala index b8a946d22b8662..ecd24cc62df7a8 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/dl/AssertionDLModel.scala +++ 
b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/dl/AssertionDLModel.scala @@ -6,6 +6,7 @@ import com.johnsnowlabs.nlp.annotators.ner.Verbose import com.johnsnowlabs.nlp.serialization.StructFeature import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.embeddings.EmbeddingsReadable +import com.johnsnowlabs.nlp.pretrained.ResourceDownloader import org.apache.spark.ml.param.{IntParam, ParamMap} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql._ @@ -117,4 +118,9 @@ trait ReadsAssertionGraph extends ParamsAndFeaturesReadable[AssertionDLModel] wi addReader(readAssertionGraph) } -object AssertionDLModel extends EmbeddingsReadable[AssertionDLModel] with ReadsAssertionGraph \ No newline at end of file +trait PretrainedDLAssertionStatus { + def pretrained(name: String = "as_fast_dl", language: Option[String] = Some("en"), folder: String = ResourceDownloader.publicLoc): AssertionDLModel = + ResourceDownloader.downloadModel(AssertionDLModel, name, language, folder) +} + +object AssertionDLModel extends EmbeddingsReadable[AssertionDLModel] with ReadsAssertionGraph with PretrainedDLAssertionStatus \ No newline at end of file diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala index f846448bbe3244..ead3246aae021f 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/assertion/logreg/AssertionLogRegModel.scala @@ -3,6 +3,7 @@ package com.johnsnowlabs.nlp.annotators.assertion.logreg import com.johnsnowlabs.nlp.AnnotatorType._ import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.embeddings.{EmbeddingsReadable, WordEmbeddings} +import com.johnsnowlabs.nlp.pretrained.ResourceDownloader import com.johnsnowlabs.nlp.serialization.{MapFeature, StructFeature} import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.util.Identifiable @@ -97,4 +98,10 @@ class AssertionLogRegModel(override val uid: String) extends RawAnnotator[Assert override def copy(extra: ParamMap): AssertionLogRegModel = defaultCopy(extra) } -object AssertionLogRegModel extends EmbeddingsReadable[AssertionLogRegModel] \ No newline at end of file +trait PretrainedAssertionLogRegModel { + def pretrained(name: String = "as_fast_lg", language: Option[String] = Some("en"), remoteLoc: String = ResourceDownloader.publicLoc): AssertionLogRegModel = + ResourceDownloader.downloadModel(AssertionLogRegModel, name, language, remoteLoc) +} + + +object AssertionLogRegModel extends EmbeddingsReadable[AssertionLogRegModel] with PretrainedAssertionLogRegModel diff --git a/src/main/scala/com/johnsnowlabs/nlp/base.scala b/src/main/scala/com/johnsnowlabs/nlp/base.scala index dd32b0eb3c2ed1..ad13ffa69afbb1 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/base.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/base.scala @@ -8,6 +8,9 @@ object base { type DocumentAssembler = com.johnsnowlabs.nlp.DocumentAssembler object DocumentAssembler extends DefaultParamsReadable[DocumentAssembler] + type ChunkAssembler = com.johnsnowlabs.nlp.ChunkAssembler + object ChunkAssembler extends DefaultParamsReadable[ChunkAssembler] + type TokenAssembler = com.johnsnowlabs.nlp.TokenAssembler object TokenAssembler extends DefaultParamsReadable[TokenAssembler]
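Below is a minimal usage sketch for the pretrained() entry points added in this patch. The default model names ("as_fast_dl" and "as_fast_lg") come from the signatures above; the column wiring follows the deleted assertion.ipynb example and is otherwise illustrative:

```scala
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.nlp.annotator._

// Build DOCUMENT annotations from the raw text column
val documentAssembler = new DocumentAssembler()
  .setInputCol("sentence")
  .setOutputCol("document")

// Downloads the default English DL assertion model ("as_fast_dl") from the public location
val assertionDl = AssertionDLModel.pretrained()
  .setInputCols(Array("document"))
  .setOutputCol("assertion")

// The logistic regression variant resolves "as_fast_lg" the same way
val assertionLogReg = AssertionLogRegModel.pretrained()
  .setInputCols(Array("document"))
  .setOutputCol("assertion")
```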