diff --git a/NER_sklearn.ipynb b/NER_sklearn.ipynb new file mode 100644 index 0000000..a176a1e --- /dev/null +++ b/NER_sklearn.ipynb @@ -0,0 +1,11414 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.feature_extraction import DictVectorizer\n", + "from sklearn.feature_extraction.text import HashingVectorizer\n", + "from sklearn.linear_model import Perceptron\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import SGDClassifier\n", + "from sklearn.linear_model import PassiveAggressiveClassifier\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Sentence #WordPOSTag
0Sentence: 1ThousandsNNSO
1NaNofINO
2NaNdemonstratorsNNSO
3NaNhaveVBPO
4NaNmarchedVBNO
\n", + "
" + ], + "text/plain": [ + " Sentence # Word POS Tag\n", + "0 Sentence: 1 Thousands NNS O\n", + "1 NaN of IN O\n", + "2 NaN demonstrators NNS O\n", + "3 NaN have VBP O\n", + "4 NaN marched VBN O" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('ner_dataset.csv', encoding = \"ISO-8859-1\")\n", + "df = df[:100000]\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Sentence # 95456\n", + "Word 0\n", + "POS 0\n", + "Tag 0\n", + "dtype: int64" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "df = df.fillna(method='ffill')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have 4,544 sentences that contain 10,922 unique words and tagged by 17 tags." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(4544, 10922, 17)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Sentence #'].nunique(), df.Word.nunique(), df.Tag.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Sentence #WordPOS
0Sentence: 1ThousandsNNS
1Sentence: 1ofIN
2Sentence: 1demonstratorsNNS
3Sentence: 1haveVBP
4Sentence: 1marchedVBN
\n", + "
" + ], + "text/plain": [ + " Sentence # Word POS\n", + "0 Sentence: 1 Thousands NNS\n", + "1 Sentence: 1 of IN\n", + "2 Sentence: 1 demonstrators NNS\n", + "3 Sentence: 1 have VBP\n", + "4 Sentence: 1 marched VBN" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X = df.drop('Tag', axis=1)\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Sentence #', 'Word', 'POS'], dtype='object')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(100000, 15507)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "v = DictVectorizer(sparse=False)\n", + "X = v.fit_transform(X.to_dict('records'))\n", + "X.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "y = df.Tag.values" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "classes = np.unique(y)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['B-art',\n", + " 'B-eve',\n", + " 'B-geo',\n", + " 'B-gpe',\n", + " 'B-nat',\n", + " 'B-org',\n", + " 'B-per',\n", + " 'B-tim',\n", + " 'I-art',\n", + " 'I-eve',\n", + " 'I-geo',\n", + " 'I-gpe',\n", + " 'I-nat',\n", + " 'I-org',\n", + " 'I-per',\n", + " 'I-tim',\n", + " 'O']" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classes = classes.tolist()\n", + "classes" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"((100000, 15507), (100000,))" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.shape, y.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((67000, 15507), (67000,))" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape, y_train.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Perceptron" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['B-art',\n", + " 'B-eve',\n", + " 'B-geo',\n", + " 'B-gpe',\n", + " 'B-nat',\n", + " 'B-org',\n", + " 'B-per',\n", + " 'B-tim',\n", + " 'I-art',\n", + " 'I-eve',\n", + " 'I-geo',\n", + " 'I-gpe',\n", + " 'I-nat',\n", + " 'I-org',\n", + " 'I-per',\n", + " 'I-tim']" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_classes = classes.copy()\n", + "new_classes.pop()\n", + "new_classes" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-- Epoch 1-- Epoch 1-- Epoch 1\n", + "-- Epoch 1\n", + "\n", + "\n", + "Norm: 11.09, NNZs: 103, Bias: -3.000000, T: 67000, Avg. loss: 0.001134Norm: 50.70, NNZs: 1350, Bias: -4.000000, T: 67000, Avg. loss: 0.014970Norm: 68.07, NNZs: 2574, Bias: -3.000000, T: 67000, Avg. loss: 0.042104\n", + "Total training time: 2.52 seconds.\n", + "\n", + "Total training time: 2.50 seconds.\n", + "\n", + "Total training time: 2.49 seconds.\n", + "Norm: 13.53, NNZs: 159, Bias: -3.000000, T: 67000, Avg. 
loss: 0.001701\n", + "Total training time: 2.53 seconds.\n", + "-- Epoch 1-- Epoch 1\n", + "-- Epoch 1\n", + "\n", + "-- Epoch 1\n", + "Norm: 8.37, NNZs: 59, Bias: -2.000000, T: 67000, Avg. loss: 0.000522\n", + "Total training time: 1.77 seconds.\n", + "-- Epoch 1\n", + "Norm: 45.00, NNZs: 1164, Bias: -3.000000, T: 67000, Avg. loss: 0.017567\n", + "Total training time: 1.79 seconds.\n", + "-- Epoch 1\n", + "Norm: 48.33, NNZs: 1679, Bias: -4.000000, T: 67000, Avg. loss: 0.022507\n", + "Total training time: 1.79 seconds.\n", + "-- Epoch 1\n", + "Norm: 57.04, NNZs: 2028, Bias: -5.000000, T: 67000, Avg. loss: 0.034493\n", + "Total training time: 1.85 seconds.\n", + "-- Epoch 1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=-1)]: Done 5 tasks | elapsed: 4.5s\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Norm: 10.15, NNZs: 83, Bias: -3.000000, T: 67000, Avg. loss: 0.000806\n", + "Total training time: 1.68 seconds.\n", + "-- Epoch 1\n", + "Norm: 10.30, NNZs: 92, Bias: -2.000000, T: 67000, Avg. loss: 0.001030\n", + "Total training time: 1.69 seconds.\n", + "-- Epoch 1\n", + "Norm: 34.35, NNZs: 811, Bias: -4.000000, T: 67000, Avg. loss: 0.011851\n", + "Total training time: 1.71 seconds.\n", + "-- Epoch 1\n", + "Norm: 10.72, NNZs: 93, Bias: -3.000000, T: 67000, Avg. loss: 0.001194\n", + "Total training time: 1.68 seconds.\n", + "-- Epoch 1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=-1)]: Done 10 tasks | elapsed: 6.2s\n", + "[Parallel(n_jobs=-1)]: Done 12 out of 17 | elapsed: 6.2s remaining: 2.5s\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Norm: 6.40, NNZs: 33, Bias: -3.000000, T: 67000, Avg. loss: 0.000194\n", + "Total training time: 1.94 seconds.\n", + "-- Epoch 1\n", + "Norm: 52.48, NNZs: 1692, Bias: -4.000000, T: 67000, Avg. 
loss: 0.025776\n", + "Total training time: 2.02 seconds.\n", + "Norm: 31.94, NNZs: 698, Bias: -4.000000, T: 67000, Avg. loss: 0.011791\n", + "Total training time: 2.01 seconds.\n", + "Norm: 60.29, NNZs: 2085, Bias: -5.000000, T: 67000, Avg. loss: 0.026746\n", + "Total training time: 2.09 seconds.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=-1)]: Done 14 out of 17 | elapsed: 8.2s remaining: 1.7s\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Norm: 73.60, NNZs: 2820, Bias: 3.000000, T: 67000, Avg. loss: 0.048672\n", + "Total training time: 1.48 seconds.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=-1)]: Done 17 out of 17 | elapsed: 9.6s finished\n" + ] + }, + { + "data": { + "text/plain": [ + "Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,\n", + " max_iter=5, n_iter=None, n_jobs=-1, penalty=None, random_state=0,\n", + " shuffle=True, tol=None, verbose=10, warm_start=False)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)\n", + "per.partial_fit(X_train, y_train, classes)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " B-art 0.15 0.12 0.14 24\n", + " B-eve 0.46 0.32 0.37 19\n", + " B-geo 0.42 0.91 0.57 1085\n", + " B-gpe 0.89 0.78 0.83 556\n", + " B-nat 0.11 0.25 0.15 12\n", + " B-org 0.55 0.35 0.43 589\n", + " B-per 0.72 0.43 0.53 564\n", + " B-tim 0.65 0.78 0.71 611\n", + " I-art 0.02 0.08 0.03 12\n", + " I-eve 0.00 0.00 0.00 18\n", + " I-geo 0.81 0.32 0.46 230\n", + " I-gpe 0.00 0.00 0.00 14\n", + " I-nat 0.50 0.50 0.50 2\n", + " I-org 0.71 0.41 0.52 445\n", + " I-per 0.76 0.20 0.32 591\n", + " I-tim 0.26 0.05 0.09 194\n", + "\n", + 
"avg / total 0.62 0.55 0.53 4966\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n", + " 'precision', 'predicted', average, warn_for)\n" + ] + } + ], + "source": [ + "print(classification_report(y_pred=per.predict(X_test), y_true=y_test, labels=new_classes))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Linear classifiers with SGD training" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\stochastic_gradient.py:128: FutureWarning: max_iter and tol parameters have been added in in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. 
From 0.21, default max_iter will be 1000, and default tol will be 1e-3.\n", + " \"and default tol will be 1e-3.\" % type(self), FutureWarning)\n" + ] + }, + { + "data": { + "text/plain": [ + "SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,\n", + " eta0=0.0, fit_intercept=True, l1_ratio=0.15,\n", + " learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,\n", + " n_jobs=1, penalty='l2', power_t=0.5, random_state=None,\n", + " shuffle=True, tol=None, verbose=0, warm_start=False)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sgd = SGDClassifier()\n", + "sgd.partial_fit(X_train, y_train, classes)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " B-art 0.33 0.12 0.18 24\n", + " B-eve 0.67 0.11 0.18 19\n", + " B-geo 0.76 0.66 0.71 1085\n", + " B-gpe 0.86 0.63 0.73 556\n", + " B-nat 0.67 0.33 0.44 12\n", + " B-org 0.63 0.42 0.50 589\n", + " B-per 0.61 0.55 0.58 564\n", + " B-tim 0.79 0.62 0.70 611\n", + " I-art 1.00 0.08 0.15 12\n", + " I-eve 0.00 0.00 0.00 18\n", + " I-geo 0.82 0.39 0.53 230\n", + " I-gpe 0.50 0.07 0.12 14\n", + " I-nat 0.00 0.00 0.00 2\n", + " I-org 0.36 0.68 0.47 445\n", + " I-per 0.59 0.67 0.63 591\n", + " I-tim 1.00 0.01 0.02 194\n", + "\n", + "avg / total 0.69 0.56 0.59 4966\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n", + " 'precision', 'predicted', average, warn_for)\n" + ] + } + ], + "source": [ + "print(classification_report(y_pred=sgd.predict(X_test), y_true=y_test, labels=new_classes))" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "### Naive Bayes classifier for multinomial models" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nb = MultinomialNB(alpha=0.01)\n", + "nb.partial_fit(X_train, y_train, classes)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " B-art 0.06 0.17 0.09 24\n", + " B-eve 0.33 0.37 0.35 19\n", + " B-geo 0.70 0.63 0.66 1085\n", + " B-gpe 0.70 0.83 0.76 556\n", + " B-nat 0.35 0.50 0.41 12\n", + " B-org 0.41 0.44 0.43 589\n", + " B-per 0.44 0.47 0.46 564\n", + " B-tim 0.56 0.61 0.59 611\n", + " I-art 0.07 0.08 0.08 12\n", + " I-eve 0.46 0.33 0.39 18\n", + " I-geo 0.40 0.52 0.46 230\n", + " I-gpe 0.13 0.14 0.14 14\n", + " I-nat 0.00 0.00 0.00 2\n", + " I-org 0.50 0.51 0.51 445\n", + " I-per 0.53 0.50 0.51 591\n", + " I-tim 0.17 0.27 0.21 194\n", + "\n", + "avg / total 0.54 0.56 0.54 4966\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_pred=nb.predict(X_test), y_true=y_test, labels = new_classes))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Passive Aggressive Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,\n", + " fit_intercept=True, loss='hinge', max_iter=None, n_iter=None,\n", + " n_jobs=1, random_state=None, shuffle=True, tol=None,\n", + " verbose=0, warm_start=False)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pa 
=PassiveAggressiveClassifier()\n", + "pa.partial_fit(X_train, y_train, classes)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " B-art 0.20 0.04 0.07 24\n", + " B-eve 0.36 0.26 0.30 19\n", + " B-geo 0.70 0.65 0.67 1085\n", + " B-gpe 0.60 0.85 0.70 556\n", + " B-nat 0.20 0.67 0.31 12\n", + " B-org 0.70 0.32 0.44 589\n", + " B-per 0.57 0.56 0.57 564\n", + " B-tim 0.84 0.62 0.71 611\n", + " I-art 0.02 0.50 0.04 12\n", + " I-eve 0.83 0.28 0.42 18\n", + " I-geo 0.50 0.62 0.55 230\n", + " I-gpe 0.40 0.43 0.41 14\n", + " I-nat 0.20 0.50 0.29 2\n", + " I-org 0.78 0.28 0.41 445\n", + " I-per 0.63 0.64 0.63 591\n", + " I-tim 0.21 0.32 0.25 194\n", + "\n", + "avg / total 0.65 0.56 0.58 4966\n", + "\n" + ] + } + ], + "source": [ + "print(classification_report(y_pred=pa.predict(X_test), y_true=y_test, labels=new_classes))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Conditional Random Fields (CRFs)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "import sklearn_crfsuite\n", + "from sklearn_crfsuite import scorers\n", + "from sklearn_crfsuite import metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Get sentences" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "class SentenceGetter(object):\n", + " \n", + " def __init__(self, data):\n", + " self.n_sent = 1\n", + " self.data = data\n", + " self.empty = False\n", + " agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(), \n", + " s['POS'].values.tolist(), \n", + " s['Tag'].values.tolist())]\n", + " self.grouped = self.data.groupby('Sentence #').apply(agg_func)\n", + " self.sentences = [s for s in self.grouped]\n", + " \n", + " def 
get_next(self):\n", + " try: \n", + " s = self.grouped['Sentence: {}'.format(self.n_sent)]\n", + " self.n_sent += 1\n", + " return s \n", + " except:\n", + " return None" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "getter = SentenceGetter(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]\n" + ] + } + ], + "source": [ + "sent = getter.get_next()\n", + "print(sent)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "sentences = getter.sentences" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Feature extraction\n", + "\n", + "Next, we extract more features (word parts, simplified POS tags, lower/title/upper flags, features of nearby words) and convert them to sklearn-crfsuite format: each sentence should be converted to a list of dicts." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "def word2features(sent, i):\n", + " word = sent[i][0]\n", + " postag = sent[i][1]\n", + " \n", + " features = {\n", + " 'bias': 1.0, \n", + " 'word.lower()': word.lower(), \n", + " 'word[-3:]': word[-3:],\n", + " 'word[-2:]': word[-2:],\n", + " 'word.isupper()': word.isupper(),\n", + " 'word.istitle()': word.istitle(),\n", + " 'word.isdigit()': word.isdigit(),\n", + " 'postag': postag,\n", + " 'postag[:2]': postag[:2],\n", + " }\n", + " if i > 0:\n", + " word1 = sent[i-1][0]\n", + " postag1 = sent[i-1][1]\n", + " features.update({\n", + " '-1:word.lower()': word1.lower(),\n", + " '-1:word.istitle()': word1.istitle(),\n", + " '-1:word.isupper()': word1.isupper(),\n", + " '-1:postag': postag1,\n", + " '-1:postag[:2]': postag1[:2],\n", + " })\n", + " else:\n", + " features['BOS'] = True\n", + " if i < len(sent)-1:\n", + " word1 = sent[i+1][0]\n", + " postag1 = sent[i+1][1]\n", + " features.update({\n", + " '+1:word.lower()': word1.lower(),\n", + " '+1:word.istitle()': word1.istitle(),\n", + " '+1:word.isupper()': word1.isupper(),\n", + " '+1:postag': postag1,\n", + " '+1:postag[:2]': postag1[:2],\n", + " })\n", + " else:\n", + " features['EOS'] = True\n", + "\n", + " return features\n", + "\n", + "def sent2features(sent):\n", + " return [word2features(sent, i) for i in range(len(sent))]\n", + "\n", + "def sent2labels(sent):\n", + " return [label for token, postag, label in sent]\n", + "\n", + "def sent2tokens(sent):\n", + " return [token for token, postag, label in sent]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above code was taken from the official sklearn-crfsuite documentation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Split train and test sets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "X = [sent2features(s) for s in sentences]\n", + "y = [sent2labels(s) for s in sentences]" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "CRF(algorithm='lbfgs', all_possible_states=None,\n", + " all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,\n", + " calibration_candidates=None, calibration_eta=None,\n", + " calibration_max_trials=None, calibration_rate=None,\n", + " calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,\n", + " gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,\n", + " max_linesearch=None, min_freq=None, model_filename=None,\n", + " num_memories=None, pa_type=None, period=None, trainer_cls=None,\n", + " variance=None, verbose=False)" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "crf = sklearn_crfsuite.CRF(\n", + " algorithm='lbfgs',\n", + " c1=0.1,\n", + " c2=0.1,\n", + " max_iterations=100,\n", + " all_possible_transitions=True\n", + ")\n", + "crf.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n", + " 'precision', 'predicted', average, warn_for)\n" + ] + }, + { + "data": { + "text/plain": [ + "0.7842087494747214" + ] + }, + "execution_count": 35, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "y_pred = crf.predict(X_test)\n", + "metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=new_classes)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n", + " 'precision', 'predicted', average, warn_for)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " B-art 1.00 0.03 0.07 29\n", + " B-eve 0.86 0.25 0.39 24\n", + " B-geo 0.75 0.88 0.81 1043\n", + " B-gpe 0.89 0.78 0.83 588\n", + " B-nat 0.67 0.20 0.31 10\n", + " B-org 0.75 0.64 0.69 649\n", + " B-per 0.81 0.81 0.81 546\n", + " B-tim 0.90 0.85 0.87 589\n", + " I-art 0.00 0.00 0.00 7\n", + " I-eve 0.57 0.22 0.32 18\n", + " I-geo 0.71 0.71 0.71 204\n", + " I-gpe 0.47 0.53 0.50 17\n", + " I-nat 1.00 0.50 0.67 2\n", + " I-org 0.78 0.73 0.76 545\n", + " I-per 0.80 0.90 0.85 574\n", + " I-tim 0.79 0.68 0.73 185\n", + "\n", + "avg / total 0.80 0.78 0.78 5030\n", + "\n" + ] + } + ], + "source": [ + "print(metrics.flat_classification_report(y_test, y_pred, labels = new_classes))" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. 
This module will be removed in 0.20.\n", + " \"This module will be removed in 0.20.\", DeprecationWarning)\n", + "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\grid_search.py:42: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.\n", + " DeprecationWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 3 folds for each of 50 candidates, totalling 150 fits\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=-1)]: Done 42 tasks | elapsed: 4.8min\n", + "[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 17.0min finished\n" + ] + }, + { + "data": { + "text/plain": [ + "RandomizedSearchCV(cv=3, error_score='raise',\n", + " estimator=CRF(algorithm='lbfgs', all_possible_states=None,\n", + " all_possible_transitions=True, averaging=None, c=None, c1=None, c2=None,\n", + " calibration_candidates=None, calibration_eta=None,\n", + " calibration_max_trials=None, calibration_rate=None,\n", + " calibration_samples=None, delta=None, epsilon=None, error...e,\n", + " num_memories=None, pa_type=None, period=None, trainer_cls=None,\n", + " variance=None, verbose=False),\n", + " fit_params={}, iid=True, n_iter=50, n_jobs=-1,\n", + " param_distributions={'c1': , 'c2': },\n", + " pre_dispatch='2*n_jobs', random_state=None, refit=True,\n", + " scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim']),\n", + " verbose=1)" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import scipy.stats\n", + "from sklearn.metrics import make_scorer\n", + "from sklearn.grid_search import RandomizedSearchCV\n", + "\n", + "crf 
= sklearn_crfsuite.CRF(\n", + " algorithm='lbfgs',\n", + " max_iterations=100,\n", + " all_possible_transitions=True\n", + ")\n", + "params_space = {\n", + " 'c1': scipy.stats.expon(scale=0.5),\n", + " 'c2': scipy.stats.expon(scale=0.05),\n", + "}\n", + "\n", + "# use the same metric for evaluation\n", + "f1_scorer = make_scorer(metrics.flat_f1_score,\n", + " average='weighted', labels=new_classes)\n", + "\n", + "# search\n", + "rs = RandomizedSearchCV(crf, params_space,\n", + " cv=3,\n", + " verbose=1,\n", + " n_jobs=-1,\n", + " n_iter=50,\n", + " scoring=f1_scorer)\n", + "rs.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "best params: {'c1': 0.0036898984638244928, 'c2': 0.11585183551331574}\n", + "best CV score: 0.7737211773297741\n", + "model size: 1.30M\n" + ] + } + ], + "source": [ + "print('best params:', rs.best_params_)\n", + "print('best CV score:', rs.best_score_)\n", + "print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n", + " 'precision', 'predicted', average, warn_for)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " B-art 1.00 0.03 0.07 29\n", + " B-eve 0.83 0.21 0.33 24\n", + " B-geo 0.75 0.87 0.81 1043\n", + " B-gpe 0.88 0.78 0.83 588\n", + " B-nat 0.67 0.20 0.31 10\n", + " B-org 0.74 0.63 0.68 649\n", + " B-per 0.81 0.80 0.81 546\n", + " B-tim 0.90 0.84 0.87 589\n", + " I-art 0.00 0.00 0.00 7\n", + " I-eve 0.67 0.22 0.33 
18\n", + " I-geo 0.67 0.71 0.69 204\n", + " I-gpe 0.39 0.53 0.45 17\n", + " I-nat 1.00 0.50 0.67 2\n", + " I-org 0.78 0.72 0.75 545\n", + " I-per 0.81 0.89 0.85 574\n", + " I-tim 0.79 0.66 0.72 185\n", + "\n", + "avg / total 0.80 0.78 0.78 5030\n", + "\n" + ] + } + ], + "source": [ + "crf = rs.best_estimator_\n", + "y_pred = crf.predict(X_test)\n", + "print(metrics.flat_classification_report(y_test, y_pred, labels=new_classes))" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top likely transitions:\n", + "B-geo -> I-geo 6.007604\n", + "I-geo -> I-geo 5.296245\n", + "B-art -> I-art 4.951198\n", + "B-eve -> I-eve 4.847021\n", + "I-tim -> I-tim 4.789188\n", + "B-per -> I-per 4.711716\n", + "I-art -> I-art 4.664539\n", + "B-tim -> I-tim 4.575079\n", + "B-org -> I-org 4.456466\n", + "I-org -> I-org 4.320635\n", + "I-per -> I-per 4.039724\n", + "I-gpe -> I-gpe 3.969627\n", + "I-eve -> I-eve 3.968368\n", + "B-gpe -> I-gpe 3.919860\n", + "O -> O 3.465068\n", + "B-nat -> I-nat 3.208265\n", + "O -> B-per 2.057576\n", + "B-org -> B-art 2.001540\n", + "I-nat -> I-nat 1.919624\n", + "B-geo -> B-tim 1.688412\n", + "\n", + "Top unlikely transitions:\n", + "B-gpe -> I-org -1.848015\n", + "O -> I-gpe -1.856660\n", + "B-geo -> I-gpe -1.880598\n", + "I-per -> I-org -1.889957\n", + "B-geo -> I-org -1.947059\n", + "O -> I-eve -2.033728\n", + "B-gpe -> I-geo -2.151673\n", + "I-org -> B-org -2.177301\n", + "B-org -> B-org -2.258343\n", + "O -> I-art -2.325744\n", + "B-org -> I-per -2.332204\n", + "B-tim -> B-tim -2.447829\n", + "I-org -> I-per -2.455738\n", + "I-per -> B-per -3.094530\n", + "O -> I-per -3.122940\n", + "B-gpe -> B-gpe -3.169217\n", + "O -> I-tim -4.152981\n", + "O -> I-geo -4.235485\n", + "B-per -> B-per -4.278895\n", + "O -> I-org -4.543933\n" + ] + } + ], + "source": [ + "from collections import Counter\n", + "\n", + "def 
print_transitions(trans_features):\n", + " for (label_from, label_to), weight in trans_features:\n", + " print(\"%-6s -> %-7s %0.6f\" % (label_from, label_to, weight))\n", + "\n", + "print(\"Top likely transitions:\")\n", + "print_transitions(Counter(crf.transition_features_).most_common(20))\n", + "\n", + "print(\"\\nTop unlikely transitions:\")\n", + "print_transitions(Counter(crf.transition_features_).most_common()[-20:])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It is very likely that the beginning of a geographical entity (B-geo) will be followed by a token inside geographical entity (I-geo), but transitions to inside of an organization name (I-org) from tokens with other labels are penalized hugely." + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top positive:\n", + "5.183603 B-tim word[-3:]:day\n", + "4.699027 O BOS\n", + "3.761687 O bias\n", + "3.754395 I-tim word[-3:]:day\n", + "3.593121 O word.lower():kurdish\n", + "3.584948 O word.lower():jewish\n", + "3.370614 B-per word.lower():president\n", + "3.338913 B-org word.lower():al-qaida\n", + "3.326234 B-tim word.lower():thanksgiving\n", + "3.269326 B-tim word[-2:]:ay\n", + "3.225759 O word[-2:]:N1\n", + "3.171786 B-tim +1:word.lower():year\n", + "3.119587 B-tim word.lower():afternoon\n", + "3.118231 O postag[:2]:VB\n", + "3.090609 B-org -1:word.lower():telephoned\n", + "3.081398 B-org word.lower():hamas\n", + "3.050390 B-gpe word.istitle()\n", + "3.037740 B-tim word[-2:]:0s\n", + "3.023566 B-gpe word.lower():nepal\n", + "3.003322 B-gpe word[-3:]:pal\n", + "2.998838 B-org +1:word.lower():fought\n", + "2.997746 I-geo +1:word.lower():town\n", + "2.995321 B-per word.lower():obama\n", + "2.980474 B-geo word.lower():mid-september\n", + "2.929354 B-geo -1:word.lower():serb\n", + "2.924037 I-geo +1:word.lower():achieved\n", + "2.921243 B-per BOS\n", + "2.915479 B-tim 
+1:word.lower():czech\n", + "2.898837 B-org -1:word.lower():brunei\n", + "2.888846 O word.lower():last\n", + "\n", + "Top negative:\n", + "-1.985977 O +1:word.lower():hours\n", + "-2.019767 O +1:word.lower():moscow\n", + "-2.034469 O -1:word.lower():year\n", + "-2.041179 O word.lower():32-year-old\n", + "-2.058087 O word.lower():later\n", + "-2.105580 O +1:word.lower():weeks\n", + "-2.155248 O +1:word.lower():monday\n", + "-2.156750 O word[-3:]:oon\n", + "-2.180948 O word.lower():evening\n", + "-2.182285 O word.lower():another\n", + "-2.191040 O word.isupper()\n", + "-2.197670 O word.lower():prime\n", + "-2.269735 O -1:word.lower():doubled\n", + "-2.297358 O word.lower():decade\n", + "-2.349126 O -1:word.lower():brunei\n", + "-2.349738 O word.lower():anniversary\n", + "-2.418581 O +1:word.lower():influence\n", + "-2.427463 O -1:word.lower():extremist\n", + "-2.431002 O +1:word.lower():czech\n", + "-2.526648 B-geo -1:word.lower():recognize\n", + "-2.645574 O +1:word.lower():mr.\n", + "-2.647633 O +1:word.lower():months\n", + "-2.708926 O word.lower():morning\n", + "-2.766577 O +1:word.lower():years\n", + "-2.818992 O +1:word.lower():year\n", + "-3.025051 O +1:word.lower():last\n", + "-3.087828 O word.isdigit()\n", + "-3.233526 O word.istitle()\n", + "-3.521244 O postag:NNP\n", + "-3.895403 O word[-2:]:0s\n" + ] + } + ], + "source": [ + "def print_state_features(state_features):\n", + " for (attr, label), weight in state_features:\n", + " print(\"%0.6f %-8s %s\" % (weight, label, attr))\n", + "\n", + "print(\"Top positive:\")\n", + "print_state_features(Counter(crf.state_features_).most_common(30))\n", + "\n", + "print(\"\\nTop negative:\")\n", + "print_state_features(Counter(crf.state_features_).most_common()[-30:])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Observations: \n", + "\n", + "1). 
__```5.183603 B-tim word[-3]:day```__\n", + "The model learns that if a nearby word was “day” then the token is likely a part of a Time indicator.\n", + "\n", + "2). __```3.370614 B-per word.lower():president```__\n", + "The model learns that token \"president\" is likely to be at the beginning of a person name.\n", + "\n", + "3). __```-3.521244 O postag:NNP```__\n", + "The model learns that proper nouns are often entities.\n", + "\n", + "4). __```-3.087828 O word.isdigit()```__\n", + "Digits are likely entities.\n", + "\n", + "5). __```-3.233526 O word.istitle()```__\n", + "TitleCased words are likely entities." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ELI5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "ELI5 is a Python package which helps to debug machine learning classifiers and explain their predictions. ELI5 allows to check weights of sklearn_crfsuite.CRF models." + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
From \\ ToOB-artI-artB-eveI-eveB-geoI-geoB-gpeI-gpeB-natI-natB-orgI-orgB-perI-perB-timI-tim
O\n", + " 3.465\n", + " \n", + " 0.477\n", + " \n", + " -2.326\n", + " \n", + " 0.973\n", + " \n", + " -2.034\n", + " \n", + " 0.919\n", + " \n", + " -4.235\n", + " \n", + " 0.506\n", + " \n", + " -1.857\n", + " \n", + " 0.049\n", + " \n", + " -1.256\n", + " \n", + " 0.794\n", + " \n", + " -4.544\n", + " \n", + " 2.058\n", + " \n", + " -3.123\n", + " \n", + " 1.417\n", + " \n", + " -4.153\n", + "
B-art\n", + " -0.876\n", + " \n", + " -0.023\n", + " \n", + " 4.951\n", + " \n", + " -0.003\n", + " \n", + " -0.101\n", + " \n", + " -0.373\n", + " \n", + " -0.232\n", + " \n", + " -0.373\n", + " \n", + " -0.251\n", + " \n", + " -0.008\n", + " \n", + " -0.08\n", + " \n", + " 0.606\n", + " \n", + " -0.601\n", + " \n", + " -0.816\n", + " \n", + " -0.784\n", + " \n", + " -0.669\n", + " \n", + " -0.324\n", + "
I-art\n", + " -0.986\n", + " \n", + " -0.279\n", + " \n", + " 4.665\n", + " \n", + " -0.014\n", + " \n", + " -0.086\n", + " \n", + " 0.336\n", + " \n", + " -0.262\n", + " \n", + " -0.272\n", + " \n", + " -0.089\n", + " \n", + " -0.008\n", + " \n", + " -0.066\n", + " \n", + " -0.44\n", + " \n", + " -0.52\n", + " \n", + " -0.747\n", + " \n", + " -0.563\n", + " \n", + " 0.093\n", + " \n", + " -0.399\n", + "
B-eve\n", + " -0.533\n", + " \n", + " -0.006\n", + " \n", + " -0.077\n", + " \n", + " -0.022\n", + " \n", + " 4.847\n", + " \n", + " -0.234\n", + " \n", + " -0.219\n", + " \n", + " -0.328\n", + " \n", + " -0.177\n", + " \n", + " 0.0\n", + " \n", + " -0.04\n", + " \n", + " -0.479\n", + " \n", + " -0.504\n", + " \n", + " -0.844\n", + " \n", + " -0.409\n", + " \n", + " -0.656\n", + " \n", + " -0.515\n", + "
I-eve\n", + " -0.333\n", + " \n", + " 0.0\n", + " \n", + " -0.034\n", + " \n", + " -0.653\n", + " \n", + " 3.968\n", + " \n", + " -0.257\n", + " \n", + " -0.193\n", + " \n", + " -0.105\n", + " \n", + " -0.059\n", + " \n", + " -0.01\n", + " \n", + " -0.009\n", + " \n", + " -0.233\n", + " \n", + " -0.272\n", + " \n", + " -0.351\n", + " \n", + " -0.387\n", + " \n", + " -0.384\n", + " \n", + " -0.177\n", + "
B-geo\n", + " 0.216\n", + " \n", + " 1.413\n", + " \n", + " -1.024\n", + " \n", + " -0.136\n", + " \n", + " -0.695\n", + " \n", + " -1.541\n", + " \n", + " 6.008\n", + " \n", + " 1.1\n", + " \n", + " -1.881\n", + " \n", + " -0.05\n", + " \n", + " -0.502\n", + " \n", + " -1.03\n", + " \n", + " -1.947\n", + " \n", + " -0.966\n", + " \n", + " -1.813\n", + " \n", + " 1.688\n", + " \n", + " -1.373\n", + "
I-geo\n", + " -0.034\n", + " \n", + " -0.048\n", + " \n", + " -0.417\n", + " \n", + " -0.029\n", + " \n", + " -0.256\n", + " \n", + " -1.011\n", + " \n", + " 5.296\n", + " \n", + " -0.468\n", + " \n", + " -0.719\n", + " \n", + " -0.009\n", + " \n", + " -0.147\n", + " \n", + " -0.786\n", + " \n", + " -1.018\n", + " \n", + " -0.791\n", + " \n", + " -0.642\n", + " \n", + " 1.238\n", + " \n", + " -0.928\n", + "
B-gpe\n", + " 0.62\n", + " \n", + " -0.255\n", + " \n", + " -0.858\n", + " \n", + " -0.278\n", + " \n", + " -0.661\n", + " \n", + " -0.184\n", + " \n", + " -2.152\n", + " \n", + " -3.169\n", + " \n", + " 3.92\n", + " \n", + " -0.049\n", + " \n", + " -0.296\n", + " \n", + " 0.951\n", + " \n", + " -1.848\n", + " \n", + " 0.572\n", + " \n", + " -1.357\n", + " \n", + " -0.347\n", + " \n", + " -0.987\n", + "
I-gpe\n", + " -0.656\n", + " \n", + " -0.163\n", + " \n", + " -0.082\n", + " \n", + " -0.01\n", + " \n", + " -0.031\n", + " \n", + " -0.007\n", + " \n", + " -0.61\n", + " \n", + " -0.624\n", + " \n", + " 3.97\n", + " \n", + " 0.0\n", + " \n", + " -0.024\n", + " \n", + " -0.377\n", + " \n", + " -0.622\n", + " \n", + " -0.619\n", + " \n", + " -0.441\n", + " \n", + " -0.684\n", + " \n", + " -0.247\n", + "
B-nat\n", + " -0.405\n", + " \n", + " -0.001\n", + " \n", + " -0.055\n", + " \n", + " 0.0\n", + " \n", + " -0.042\n", + " \n", + " -0.254\n", + " \n", + " -0.109\n", + " \n", + " -0.182\n", + " \n", + " -0.068\n", + " \n", + " -0.005\n", + " \n", + " 3.208\n", + " \n", + " -0.255\n", + " \n", + " -0.334\n", + " \n", + " -0.55\n", + " \n", + " -0.394\n", + " \n", + " -0.231\n", + " \n", + " -0.078\n", + "
I-nat\n", + " -0.835\n", + " \n", + " -0.002\n", + " \n", + " -0.037\n", + " \n", + " 0.0\n", + " \n", + " -0.007\n", + " \n", + " -0.18\n", + " \n", + " -0.053\n", + " \n", + " -0.093\n", + " \n", + " -0.026\n", + " \n", + " -0.066\n", + " \n", + " 1.92\n", + " \n", + " -0.133\n", + " \n", + " -0.227\n", + " \n", + " -0.364\n", + " \n", + " -0.231\n", + " \n", + " -0.182\n", + " \n", + " -0.04\n", + "
B-org\n", + " 0.046\n", + " \n", + " 2.002\n", + " \n", + " -1.136\n", + " \n", + " -0.195\n", + " \n", + " -0.816\n", + " \n", + " -0.611\n", + " \n", + " -1.839\n", + " \n", + " -0.26\n", + " \n", + " -1.572\n", + " \n", + " -0.129\n", + " \n", + " -0.703\n", + " \n", + " -2.258\n", + " \n", + " 4.456\n", + " \n", + " -0.771\n", + " \n", + " -2.332\n", + " \n", + " -0.652\n", + " \n", + " -1.306\n", + "
I-org\n", + " 0.042\n", + " \n", + " -0.319\n", + " \n", + " -0.961\n", + " \n", + " -0.174\n", + " \n", + " -0.68\n", + " \n", + " -1.657\n", + " \n", + " -1.318\n", + " \n", + " -0.708\n", + " \n", + " -0.912\n", + " \n", + " -0.434\n", + " \n", + " -0.591\n", + " \n", + " -2.177\n", + " \n", + " 4.321\n", + " \n", + " -0.133\n", + " \n", + " -2.456\n", + " \n", + " 0.119\n", + " \n", + " -1.327\n", + "
B-per\n", + " 0.016\n", + " \n", + " -0.302\n", + " \n", + " -0.773\n", + " \n", + " -0.174\n", + " \n", + " -0.758\n", + " \n", + " 0.028\n", + " \n", + " -1.0\n", + " \n", + " 0.617\n", + " \n", + " -1.042\n", + " \n", + " -0.095\n", + " \n", + " -0.668\n", + " \n", + " 0.918\n", + " \n", + " -1.698\n", + " \n", + " -4.279\n", + " \n", + " 4.712\n", + " \n", + " -0.386\n", + " \n", + " -0.846\n", + "
I-per\n", + " -0.223\n", + " \n", + " -0.169\n", + " \n", + " -0.683\n", + " \n", + " -0.278\n", + " \n", + " -0.747\n", + " \n", + " -1.268\n", + " \n", + " -1.189\n", + " \n", + " -0.71\n", + " \n", + " -0.974\n", + " \n", + " -0.078\n", + " \n", + " -0.593\n", + " \n", + " -1.132\n", + " \n", + " -1.89\n", + " \n", + " -3.095\n", + " \n", + " 4.04\n", + " \n", + " 0.177\n", + " \n", + " -1.16\n", + "
B-tim\n", + " 0.311\n", + " \n", + " -0.451\n", + " \n", + " -0.4\n", + " \n", + " -0.059\n", + " \n", + " -0.557\n", + " \n", + " -0.759\n", + " \n", + " -0.991\n", + " \n", + " -1.165\n", + " \n", + " -0.447\n", + " \n", + " 0.611\n", + " \n", + " -0.252\n", + " \n", + " -0.629\n", + " \n", + " -1.229\n", + " \n", + " -1.309\n", + " \n", + " -0.897\n", + " \n", + " -2.448\n", + " \n", + " 4.575\n", + "
I-tim\n", + " 0.145\n", + " \n", + " -0.144\n", + " \n", + " -0.142\n", + " \n", + " -0.356\n", + " \n", + " -0.15\n", + " \n", + " 0.62\n", + " \n", + " -0.291\n", + " \n", + " 0.037\n", + " \n", + " -0.067\n", + " \n", + " -0.064\n", + " \n", + " -0.017\n", + " \n", + " -0.812\n", + " \n", + " -0.74\n", + " \n", + " -0.006\n", + " \n", + " -0.224\n", + " \n", + " -1.571\n", + " \n", + " 4.789\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " y=O\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=B-art\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=I-art\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=B-eve\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=I-eve\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=B-geo\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=I-geo\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=B-gpe\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=I-gpe\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=B-nat\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=I-nat\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=B-org\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=I-org\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=B-per\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=I-per\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=B-tim\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=I-tim\n", + " \n", + "\n", + "\n", + "top features\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +4.699\n", + " \n", + " BOS\n", + "
\n", + " +3.762\n", + " \n", + " bias\n", + "
\n", + " +3.593\n", + " \n", + " word.lower():kurdish\n", + "
\n", + " +3.585\n", + " \n", + " word.lower():jewish\n", + "
\n", + " +3.226\n", + " \n", + " word[-2:]:N1\n", + "
\n", + " +3.118\n", + " \n", + " postag[:2]:VB\n", + "
\n", + " … 6280 more positive …\n", + "
\n", + " … 1789 more negative …\n", + "
\n", + " -3.088\n", + " \n", + " word.isdigit()\n", + "
\n", + " -3.234\n", + " \n", + " word.istitle()\n", + "
\n", + " -3.521\n", + " \n", + " postag:NNP\n", + "
\n", + " -3.895\n", + " \n", + " word[-2:]:0s\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +2.250\n", + " \n", + " word.lower():twitter\n", + "
\n", + " +2.203\n", + " \n", + " word.lower():english\n", + "
\n", + " +2.038\n", + " \n", + " -1:word.lower():tamilnet\n", + "
\n", + " +1.649\n", + " \n", + " word.lower():dodge\n", + "
\n", + " +1.623\n", + " \n", + " word.lower():jeep\n", + "
\n", + " +1.562\n", + " \n", + " -1:word.lower():newspaper\n", + "
\n", + " +1.536\n", + " \n", + " -1:word.lower():unlike\n", + "
\n", + " +1.528\n", + " \n", + " word[-3:]:eep\n", + "
\n", + " +1.515\n", + " \n", + " word[-2:]:ep\n", + "
\n", + " +1.511\n", + " \n", + " -1:word.lower():either\n", + "
\n", + " … 212 more positive …\n", + "
\n", + " … 22 more negative …\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +1.221\n", + " \n", + " word.lower():constitution\n", + "
\n", + " +1.220\n", + " \n", + " +1:word.lower():airport\n", + "
\n", + " +1.076\n", + " \n", + " -1:word.lower():magazine\n", + "
\n", + " +1.075\n", + " \n", + " word[-2:]:Us\n", + "
\n", + " +1.075\n", + " \n", + " word[-3:]:Us\n", + "
\n", + " +1.048\n", + " \n", + " word.lower():us\n", + "
\n", + " +1.028\n", + " \n", + " +1:word.lower():newspaper\n", + "
\n", + " +0.988\n", + " \n", + " word[-2:]:le\n", + "
\n", + " +0.961\n", + " \n", + " +1:word.lower():would\n", + "
\n", + " +0.951\n", + " \n", + " word.lower():simple\n", + "
\n", + " … 192 more positive …\n", + "
\n", + " … 12 more negative …\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +2.018\n", + " \n", + " -1:word.lower():war\n", + "
\n", + " +1.604\n", + " \n", + " -1:word.lower():first\n", + "
\n", + " +1.530\n", + " \n", + " -1:word.lower():celebrated\n", + "
\n", + " +1.523\n", + " \n", + " word.lower():christmas\n", + "
\n", + " +1.456\n", + " \n", + " +1:word.lower():get\n", + "
\n", + " +1.429\n", + " \n", + " word.lower():games\n", + "
\n", + " +1.366\n", + " \n", + " word[-3:]:mas\n", + "
\n", + " +1.347\n", + " \n", + " word.lower():ii\n", + "
\n", + " +1.347\n", + " \n", + " word[-3:]:II\n", + "
\n", + " +1.344\n", + " \n", + " word[-2:]:II\n", + "
\n", + " … 108 more positive …\n", + "
\n", + " … 17 more negative …\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +1.133\n", + " \n", + " word.lower():peace\n", + "
\n", + " +1.132\n", + " \n", + " postag:NNPS\n", + "
\n", + " +1.122\n", + " \n", + " -1:word.lower():korean\n", + "
\n", + " +1.111\n", + " \n", + " word[-2:]:up\n", + "
\n", + " +1.030\n", + " \n", + " word.lower():cup\n", + "
\n", + " +1.030\n", + " \n", + " word[-3:]:Cup\n", + "
\n", + " +1.018\n", + " \n", + " word[-3:]:ace\n", + "
\n", + " +0.991\n", + " \n", + " word[-3:]:pen\n", + "
\n", + " +0.988\n", + " \n", + " word.lower():open\n", + "
\n", + " +0.979\n", + " \n", + " word[-2:]:rs\n", + "
\n", + " … 106 more positive …\n", + "
\n", + " … 13 more negative …\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +2.980\n", + " \n", + " word.lower():mid-september\n", + "
\n", + " +2.929\n", + " \n", + " -1:word.lower():serb\n", + "
\n", + " +2.882\n", + " \n", + " word.lower():aswat\n", + "
\n", + " +2.623\n", + " \n", + " word.lower():washington\n", + "
\n", + " +2.616\n", + " \n", + " word.lower():china\n", + "
\n", + " +2.531\n", + " \n", + " +1:word.lower():palestinian\n", + "
\n", + " +2.503\n", + " \n", + " word.lower():zahedan\n", + "
\n", + " +2.451\n", + " \n", + " word.lower():beijing\n", + "
\n", + " +2.441\n", + " \n", + " word[-3:]:the\n", + "
\n", + " … 1699 more positive …\n", + "
\n", + " … 275 more negative …\n", + "
\n", + " -2.527\n", + " \n", + " -1:word.lower():recognize\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +2.998\n", + " \n", + " +1:word.lower():town\n", + "
\n", + " +2.924\n", + " \n", + " +1:word.lower():achieved\n", + "
\n", + " +2.472\n", + " \n", + " +1:word.lower():block\n", + "
\n", + " +2.221\n", + " \n", + " +1:word.lower():produced\n", + "
\n", + " +2.129\n", + " \n", + " -1:word.lower():tulkarem\n", + "
\n", + " +2.113\n", + " \n", + " word.lower():settlement\n", + "
\n", + " +1.879\n", + " \n", + " -1:word.lower():western\n", + "
\n", + " +1.805\n", + " \n", + " +1:word.lower():regional\n", + "
\n", + " +1.781\n", + " \n", + " +1:word.lower():about\n", + "
\n", + " +1.779\n", + " \n", + " -1:word.lower():eastern\n", + "
\n", + " … 726 more positive …\n", + "
\n", + " … 109 more negative …\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +3.050\n", + " \n", + " word.istitle()\n", + "
\n", + " +3.024\n", + " \n", + " word.lower():nepal\n", + "
\n", + " +3.003\n", + " \n", + " word[-3:]:pal\n", + "
\n", + " +2.784\n", + " \n", + " +1:word.lower():mayor\n", + "
\n", + " +2.632\n", + " \n", + " -1:word.lower():behind\n", + "
\n", + " +2.612\n", + " \n", + " postag:NNS\n", + "
\n", + " +2.587\n", + " \n", + " +1:word.lower():representative\n", + "
\n", + " +2.584\n", + " \n", + " word[-3:]:ans\n", + "
\n", + " +2.560\n", + " \n", + " +1:word.lower():unemployment\n", + "
\n", + " +2.502\n", + " \n", + " +1:word.lower():if\n", + "
\n", + " … 876 more positive …\n", + "
\n", + " … 188 more negative …\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +2.781\n", + " \n", + " +1:word.lower():began\n", + "
\n", + " +2.258\n", + " \n", + " -1:word.lower():soviet\n", + "
\n", + " +1.976\n", + " \n", + " +1:word.lower():health\n", + "
\n", + " +1.876\n", + " \n", + " +1:word.lower():returned\n", + "
\n", + " +1.786\n", + " \n", + " +1:word.lower():that\n", + "
\n", + " +1.785\n", + " \n", + " word[-3:]:can\n", + "
\n", + " +1.519\n", + " \n", + " +1:word.istitle()\n", + "
\n", + " +1.473\n", + " \n", + " -1:word.lower():bosnian\n", + "
\n", + " +1.465\n", + " \n", + " -1:word.lower():democratic\n", + "
\n", + " +1.394\n", + " \n", + " +1:word.lower():countries\n", + "
\n", + " … 118 more positive …\n", + "
\n", + " … 18 more negative …\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +1.646\n", + " \n", + " word.isupper()\n", + "
\n", + " +1.553\n", + " \n", + " word.lower():h5n1\n", + "
\n", + " +1.553\n", + " \n", + " word[-3:]:5N1\n", + "
\n", + " +1.532\n", + " \n", + " +1:word.lower():toll\n", + "
\n", + " +1.522\n", + " \n", + " word.lower():katrina\n", + "
\n", + " +1.492\n", + " \n", + " word[-2:]:N1\n", + "
\n", + " +1.449\n", + " \n", + " word.lower():marburg\n", + "
\n", + " +1.409\n", + " \n", + " word[-3:]:urg\n", + "
\n", + " +1.408\n", + " \n", + " +1:word.lower():form\n", + "
\n", + " +1.398\n", + " \n", + " +1:word.lower():katrina\n", + "
\n", + " … 71 more positive …\n", + "
\n", + " … 11 more negative …\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +1.275\n", + " \n", + " -1:word.lower():hurricane\n", + "
\n", + " +1.275\n", + " \n", + " word.lower():katrina\n", + "
\n", + " +1.218\n", + " \n", + " +1:word.lower():outbreak\n", + "
\n", + " +1.070\n", + " \n", + " word[-3:]:ina\n", + "
\n", + " +1.055\n", + " \n", + " word[-2:]:na\n", + "
\n", + " +0.841\n", + " \n", + " -1:word.lower():jing\n", + "
\n", + " +0.826\n", + " \n", + " word.lower():jing\n", + "
\n", + " +0.795\n", + " \n", + " -1:postag:NNP\n", + "
\n", + " +0.763\n", + " \n", + " word[-2:]:me\n", + "
\n", + " +0.726\n", + " \n", + " -1:word.istitle()\n", + "
\n", + " … 29 more positive …\n", + "
\n", + " … 11 more negative …\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +3.339\n", + " \n", + " word.lower():al-qaida\n", + "
\n", + " +3.091\n", + " \n", + " -1:word.lower():telephoned\n", + "
\n", + " +3.081\n", + " \n", + " word.lower():hamas\n", + "
\n", + " +2.999\n", + " \n", + " +1:word.lower():fought\n", + "
\n", + " +2.899\n", + " \n", + " -1:word.lower():brunei\n", + "
\n", + " +2.874\n", + " \n", + " word.lower():parliament\n", + "
\n", + " +2.750\n", + " \n", + " word[-3:]:ban\n", + "
\n", + " +2.677\n", + " \n", + " +1:word.lower():recognizes\n", + "
\n", + " +2.663\n", + " \n", + " +1:word.lower():influence\n", + "
\n", + " +2.640\n", + " \n", + " +1:word.lower():assistant\n", + "
\n", + " … 1499 more positive …\n", + "
\n", + " … 255 more negative …\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +2.207\n", + " \n", + " +1:word.lower():mr.\n", + "
\n", + " +1.890\n", + " \n", + " -1:word.lower():mediterranean\n", + "
\n", + " +1.766\n", + " \n", + " +1:word.lower():will\n", + "
\n", + " +1.703\n", + " \n", + " +1:word.lower():yorker\n", + "
\n", + " +1.692\n", + " \n", + " word.lower():ministry\n", + "
\n", + " +1.685\n", + " \n", + " -1:word.lower():munich\n", + "
\n", + " +1.656\n", + " \n", + " word[-3:]:ate\n", + "
\n", + " +1.621\n", + " \n", + " +1:word.lower():in\n", + "
\n", + " +1.580\n", + " \n", + " word.lower():nations\n", + "
\n", + " … 1511 more positive …\n", + "
\n", + " … 207 more negative …\n", + "
\n", + " -1.769\n", + " \n", + " word[-3:]:ary\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +3.371\n", + " \n", + " word.lower():president\n", + "
\n", + " +2.995\n", + " \n", + " word.lower():obama\n", + "
\n", + " +2.921\n", + " \n", + " BOS\n", + "
\n", + " +2.818\n", + " \n", + " word.lower():jupiter\n", + "
\n", + " +2.791\n", + " \n", + " word.lower():prime\n", + "
\n", + " +2.687\n", + " \n", + " word.lower():gotovina\n", + "
\n", + " +2.661\n", + " \n", + " +1:word.lower():vladimir\n", + "
\n", + " +2.563\n", + " \n", + " word.lower():bolton\n", + "
\n", + " +2.525\n", + " \n", + " -1:word.lower():under\n", + "
\n", + " +2.415\n", + " \n", + " +1:word.lower():president\n", + "
\n", + " … 1513 more positive …\n", + "
\n", + " … 288 more negative …\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +1.742\n", + " \n", + " +1:word.lower():david\n", + "
\n", + " +1.518\n", + " \n", + " -1:postag:NN\n", + "
\n", + " +1.455\n", + " \n", + " +1:word.lower():saad\n", + "
\n", + " +1.335\n", + " \n", + " -1:word.lower():masjid\n", + "
\n", + " +1.283\n", + " \n", + " word[-3:]:aad\n", + "
\n", + " … 1936 more positive …\n", + "
\n", + " … 263 more negative …\n", + "
\n", + " -1.300\n", + " \n", + " +1:word.lower():on\n", + "
\n", + " -1.310\n", + " \n", + " -1:word.lower():sri\n", + "
\n", + " -1.454\n", + " \n", + " word[-2:]:ka\n", + "
\n", + " -1.542\n", + " \n", + " -1:word.lower():vice\n", + "
\n", + " -1.587\n", + " \n", + " word[-3:]:ion\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +5.184\n", + " \n", + " word[-3:]:day\n", + "
\n", + " +3.326\n", + " \n", + " word.lower():thanksgiving\n", + "
\n", + " +3.269\n", + " \n", + " word[-2:]:ay\n", + "
\n", + " +3.172\n", + " \n", + " +1:word.lower():year\n", + "
\n", + " +3.120\n", + " \n", + " word.lower():afternoon\n", + "
\n", + " +3.038\n", + " \n", + " word[-2:]:0s\n", + "
\n", + " +2.915\n", + " \n", + " +1:word.lower():czech\n", + "
\n", + " +2.824\n", + " \n", + " word[-3:]:ber\n", + "
\n", + " +2.819\n", + " \n", + " word.lower():august\n", + "
\n", + " +2.809\n", + " \n", + " +1:word.lower():years\n", + "
\n", + " … 964 more positive …\n", + "
\n", + " … 210 more negative …\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +3.754\n", + " \n", + " word[-3:]:day\n", + "
\n", + " +2.461\n", + " \n", + " word[-2:]:ay\n", + "
\n", + " +2.334\n", + " \n", + " -1:word.lower():ceremonies\n", + "
\n", + " +2.279\n", + " \n", + " +1:word.lower():moscow\n", + "
\n", + " +1.895\n", + " \n", + " -1:word.lower():march\n", + "
\n", + " +1.894\n", + " \n", + " -1:word.lower():anniversary\n", + "
\n", + " +1.853\n", + " \n", + " word.lower():decades\n", + "
\n", + " +1.800\n", + " \n", + " -1:word.lower():june\n", + "
\n", + " +1.769\n", + " \n", + " +1:word.lower():rebel\n", + "
\n", + " +1.715\n", + " \n", + " word.lower():quarter\n", + "
\n", + " … 609 more positive …\n", + "
\n", + " … 100 more negative …\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import eli5\n", + "\n", + "eli5.show_weights(crf, top=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It does make sense that I-entity must follow B-entity, such as I-geo follows B-geo, I-org follows B-org, I-per follows B-per, and so on. \n", + "\n", + "We can also see that it is not common in this dataset to have a person right after an organization name (B-org -> I-per has a large negative weight)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we regularize CRF more, we can expect that only features which are generic will remain, and memoized tokens will go. 
Let’s check what effect does regularization have on CRF weights:" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
From \\ ToOB-artI-artB-eveI-eveB-geoI-geoB-gpeI-gpeB-natI-natB-orgI-orgB-perI-perB-timI-tim
O\n", + " 1.782\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 1.456\n", + " \n", + " 0.0\n", + " \n", + " 0.303\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.791\n", + " \n", + " 0.0\n", + " \n", + " 0.062\n", + " \n", + " 0.0\n", + " \n", + " 1.709\n", + " \n", + " 0.0\n", + "
B-art\n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + "
I-art\n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + "
B-eve\n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + "
I-eve\n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + "
B-geo\n", + " 0.23\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 2.704\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + "
I-geo\n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + "
B-gpe\n", + " 0.01\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + "
I-gpe\n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + "
B-nat\n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + "
I-nat\n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + "
B-org\n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 2.811\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + "
I-org\n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 2.59\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + "
B-per\n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 3.474\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + "
I-per\n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 1.473\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + "
B-tim\n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 1.744\n", + "
I-tim\n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " y=O\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=B-art\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=I-art\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=B-eve\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=I-eve\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=B-geo\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=I-geo\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=B-gpe\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=I-gpe\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=B-nat\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=I-nat\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=B-org\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=I-org\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=B-per\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=I-per\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=B-tim\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=I-tim\n", + " \n", + "\n", + "\n", + "top features\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +3.691\n", + " \n", + " bias\n", + "
\n", + " +1.774\n", + " \n", + " BOS\n", + "
\n", + " +0.984\n", + " \n", + " -1:postag[:2]:NN\n", + "
\n", + " +0.435\n", + " \n", + " postag[:2]:VB\n", + "
\n", + " +0.218\n", + " \n", + " EOS\n", + "
\n", + " … 10 more positive …\n", + "
\n", + " … 1 more negative …\n", + "
\n", + " -0.467\n", + " \n", + " postag:CD\n", + "
\n", + " -0.467\n", + " \n", + " postag[:2]:CD\n", + "
\n", + " -1.472\n", + " \n", + " word.isdigit()\n", + "
\n", + " -2.430\n", + " \n", + " word.istitle()\n", + "
\n", + " -3.320\n", + " \n", + " postag:NNP\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +1.052\n", + " \n", + " postag:NNP\n", + "
\n", + " +0.534\n", + " \n", + " word.istitle()\n", + "
\n", + " +0.218\n", + " \n", + " -1:postag:IN\n", + "
\n", + " +0.218\n", + " \n", + " -1:postag[:2]:IN\n", + "
\n", + " +0.125\n", + " \n", + " -1:word.lower():in\n", + "
\n", + " -0.289\n", + " \n", + " -1:postag[:2]:NN\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +0.267\n", + " \n", + " -1:postag:NNP\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +1.533\n", + " \n", + " postag:JJ\n", + "
\n", + " +1.506\n", + " \n", + " postag[:2]:JJ\n", + "
\n", + " +1.139\n", + " \n", + " word.istitle()\n", + "
\n", + " +0.549\n", + " \n", + " word[-2:]:an\n", + "
\n", + " -0.033\n", + " \n", + " postag:NNP\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +0.806\n", + " \n", + " postag:NNP\n", + "
\n", + " +0.565\n", + " \n", + " postag[:2]:NN\n", + "
\n", + " +0.230\n", + " \n", + " -1:postag[:2]:DT\n", + "
\n", + " +0.230\n", + " \n", + " -1:postag:DT\n", + "
\n", + " +0.004\n", + " \n", + " word.isupper()\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +0.496\n", + " \n", + " -1:postag:NNP\n", + "
\n", + " +0.377\n", + " \n", + " -1:word.istitle()\n", + "
\n", + " +0.225\n", + " \n", + " -1:postag[:2]:NN\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +0.510\n", + " \n", + " postag:NNP\n", + "
\n", + " +0.438\n", + " \n", + " +1:postag:NNP\n", + "
\n", + " +0.308\n", + " \n", + " +1:word.istitle()\n", + "
\n", + " +0.075\n", + " \n", + " postag[:2]:NN\n", + "
\n", + " +0.022\n", + " \n", + " word.istitle()\n", + "
\n", + " +0.002\n", + " \n", + " +1:postag[:2]:NN\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +0.881\n", + " \n", + " -1:postag:NNP\n", + "
\n", + " +0.480\n", + " \n", + " -1:postag[:2]:NN\n", + "
\n", + " +0.404\n", + " \n", + " -1:word.istitle()\n", + "
\n", + " +0.196\n", + " \n", + " postag:NNP\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +1.740\n", + " \n", + " word[-2:]:ay\n", + "
\n", + " +1.657\n", + " \n", + " word[-3:]:day\n", + "
\n", + " +0.204\n", + " \n", + " postag[:2]:CD\n", + "
\n", + " +0.204\n", + " \n", + " postag:CD\n", + "
\n", + " +0.096\n", + " \n", + " bias\n", + "
\n", + " +0.033\n", + " \n", + " -1:postag[:2]:IN\n", + "
\n", + " +0.033\n", + " \n", + " -1:postag:IN\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "crf = sklearn_crfsuite.CRF(\n", + " algorithm='lbfgs',\n", + " c1=200,\n", + " c2=0.1,\n", + " max_iterations=20,\n", + " all_possible_transitions=False,\n", + ")\n", + "crf.fit(X_train, y_train)\n", + "eli5.show_weights(crf, top=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
From \\ ToOB-artI-artB-eveI-eveB-geoI-geoB-gpeI-gpeB-natI-natB-orgI-orgB-perI-perB-timI-tim
O\n", + " 3.561\n", + " \n", + " 0.747\n", + " \n", + " -2.344\n", + " \n", + " 0.928\n", + " \n", + " -1.847\n", + " \n", + " 1.1\n", + " \n", + " -4.541\n", + " \n", + " 0.523\n", + " \n", + " -1.684\n", + " \n", + " 0.239\n", + " \n", + " -1.025\n", + " \n", + " 0.795\n", + " \n", + " -4.779\n", + " \n", + " 1.64\n", + " \n", + " -3.017\n", + " \n", + " 1.39\n", + " \n", + " -4.202\n", + "
B-art\n", + " -0.556\n", + " \n", + " 0.0\n", + " \n", + " 5.467\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " -0.021\n", + " \n", + " -0.085\n", + " \n", + " -0.145\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.478\n", + " \n", + " -0.328\n", + " \n", + " -0.735\n", + " \n", + " -0.394\n", + " \n", + " -0.41\n", + " \n", + " 0.0\n", + "
I-art\n", + " -0.74\n", + " \n", + " 0.0\n", + " \n", + " 5.438\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.509\n", + " \n", + " -0.075\n", + " \n", + " -0.079\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " -0.091\n", + " \n", + " -0.315\n", + " \n", + " -0.777\n", + " \n", + " -0.246\n", + " \n", + " 0.19\n", + " \n", + " -0.062\n", + "
B-eve\n", + " -0.213\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 5.415\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " -0.12\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " -0.123\n", + " \n", + " -0.319\n", + " \n", + " -0.662\n", + " \n", + " -0.397\n", + " \n", + " -0.512\n", + " \n", + " -0.193\n", + "
I-eve\n", + " -0.239\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " -0.279\n", + " \n", + " 4.838\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " -0.013\n", + " \n", + " -0.215\n", + " \n", + " -0.002\n", + " \n", + " -0.236\n", + " \n", + " 0.0\n", + "
B-geo\n", + " 0.31\n", + " \n", + " 1.357\n", + " \n", + " -0.737\n", + " \n", + " 0.0\n", + " \n", + " -0.498\n", + " \n", + " -1.395\n", + " \n", + " 5.227\n", + " \n", + " 0.733\n", + " \n", + " -1.392\n", + " \n", + " 0.0\n", + " \n", + " -0.233\n", + " \n", + " -0.973\n", + " \n", + " -1.993\n", + " \n", + " -1.367\n", + " \n", + " -1.873\n", + " \n", + " 1.648\n", + " \n", + " -1.25\n", + "
I-geo\n", + " -0.01\n", + " \n", + " 0.0\n", + " \n", + " -0.123\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " -0.88\n", + " \n", + " 4.457\n", + " \n", + " -0.486\n", + " \n", + " -0.606\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " -0.766\n", + " \n", + " -0.944\n", + " \n", + " -1.214\n", + " \n", + " -0.579\n", + " \n", + " 1.036\n", + " \n", + " -0.96\n", + "
B-gpe\n", + " 0.623\n", + " \n", + " 0.0\n", + " \n", + " -0.581\n", + " \n", + " -0.029\n", + " \n", + " -0.427\n", + " \n", + " -0.116\n", + " \n", + " -2.327\n", + " \n", + " -3.359\n", + " \n", + " 4.986\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.901\n", + " \n", + " -1.945\n", + " \n", + " 0.294\n", + " \n", + " -1.231\n", + " \n", + " -0.431\n", + " \n", + " -0.876\n", + "
I-gpe\n", + " -0.21\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.24\n", + " \n", + " -0.291\n", + " \n", + " -0.233\n", + " \n", + " 4.878\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " -0.101\n", + " \n", + " -0.352\n", + " \n", + " -0.385\n", + " \n", + " -0.263\n", + " \n", + " -0.379\n", + " \n", + " 0.0\n", + "
B-nat\n", + " -0.402\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 3.694\n", + " \n", + " 0.0\n", + " \n", + " -0.016\n", + " \n", + " -0.48\n", + " \n", + " -0.194\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + "
I-nat\n", + " -0.771\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 2.45\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " -0.167\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + "
B-org\n", + " 0.086\n", + " \n", + " 2.334\n", + " \n", + " -1.057\n", + " \n", + " 0.0\n", + " \n", + " -0.601\n", + " \n", + " -0.496\n", + " \n", + " -2.027\n", + " \n", + " -0.234\n", + " \n", + " -0.774\n", + " \n", + " 0.0\n", + " \n", + " -0.446\n", + " \n", + " -2.285\n", + " \n", + " 4.476\n", + " \n", + " -0.94\n", + " \n", + " -2.309\n", + " \n", + " -0.679\n", + " \n", + " -1.217\n", + "
I-org\n", + " -0.001\n", + " \n", + " -0.086\n", + " \n", + " -0.801\n", + " \n", + " 0.0\n", + " \n", + " -0.451\n", + " \n", + " -1.59\n", + " \n", + " -1.377\n", + " \n", + " -0.726\n", + " \n", + " -0.608\n", + " \n", + " -0.207\n", + " \n", + " -0.373\n", + " \n", + " -2.271\n", + " \n", + " 4.299\n", + " \n", + " -0.503\n", + " \n", + " -2.332\n", + " \n", + " 0.003\n", + " \n", + " -1.35\n", + "
B-per\n", + " 0.254\n", + " \n", + " 0.0\n", + " \n", + " -0.645\n", + " \n", + " 0.0\n", + " \n", + " -0.393\n", + " \n", + " 0.188\n", + " \n", + " -1.259\n", + " \n", + " 0.292\n", + " \n", + " -0.284\n", + " \n", + " 0.0\n", + " \n", + " -0.455\n", + " \n", + " 1.009\n", + " \n", + " -1.565\n", + " \n", + " -4.509\n", + " \n", + " 5.042\n", + " \n", + " -0.393\n", + " \n", + " -0.676\n", + "
I-per\n", + " -0.066\n", + " \n", + " 0.0\n", + " \n", + " -0.68\n", + " \n", + " -0.037\n", + " \n", + " -0.596\n", + " \n", + " -1.35\n", + " \n", + " -1.487\n", + " \n", + " -0.733\n", + " \n", + " -0.455\n", + " \n", + " 0.0\n", + " \n", + " -0.407\n", + " \n", + " -1.147\n", + " \n", + " -1.968\n", + " \n", + " -3.453\n", + " \n", + " 3.898\n", + " \n", + " 0.0\n", + " \n", + " -1.114\n", + "
B-tim\n", + " 0.428\n", + " \n", + " -0.122\n", + " \n", + " -0.14\n", + " \n", + " 0.0\n", + " \n", + " -0.3\n", + " \n", + " -0.528\n", + " \n", + " -1.005\n", + " \n", + " -1.195\n", + " \n", + " -0.021\n", + " \n", + " 0.559\n", + " \n", + " 0.0\n", + " \n", + " -0.534\n", + " \n", + " -1.379\n", + " \n", + " -1.455\n", + " \n", + " -0.621\n", + " \n", + " -2.518\n", + " \n", + " 4.802\n", + "
I-tim\n", + " 0.322\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.693\n", + " \n", + " -0.136\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " -0.609\n", + " \n", + " -0.621\n", + " \n", + " 0.0\n", + " \n", + " 0.0\n", + " \n", + " -1.51\n", + " \n", + " 5.095\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "crf = sklearn_crfsuite.CRF(\n", + " algorithm='lbfgs',\n", + " c1=0.1,\n", + " c2=0.1,\n", + " max_iterations=100,\n", + " all_possible_transitions=True,\n", + ")\n", + "crf.fit(X_train, y_train);\n", + "eli5.show_weights(crf, top=5, show=['transition_features'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The model learned large negative weights for impossible transitions like O -> I-geo, O -> I-org and O -> I-tim, and so on." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to easy to read, we can check only a subset of tags." + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
From \\ ToOB-orgI-per
O\n", + " 3.561\n", + " \n", + " 0.795\n", + " \n", + " -3.017\n", + "
B-org\n", + " 0.086\n", + " \n", + " -2.285\n", + " \n", + " -2.309\n", + "
I-per\n", + " -0.066\n", + " \n", + " -1.147\n", + " \n", + " 3.898\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " y=O\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=B-org\n", + " \n", + "\n", + "\n", + "top features\n", + " \n", + " \n", + " \n", + " y=I-per\n", + " \n", + "\n", + "\n", + "top features\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +4.720\n", + " \n", + " BOS\n", + "
\n", + " +4.236\n", + " \n", + " bias\n", + "
\n", + " +4.046\n", + " \n", + " word.lower():jewish\n", + "
\n", + " +3.494\n", + " \n", + " word.lower():kurdish\n", + "
\n", + " +3.435\n", + " \n", + " word[-2:]:N1\n", + "
\n", + " +3.031\n", + " \n", + " +1:word.lower():minister\n", + "
\n", + " … 1685 more positive …\n", + "
\n", + " … 983 more negative …\n", + "
\n", + " -3.080\n", + " \n", + " +1:word.lower():last\n", + "
\n", + " -3.387\n", + " \n", + " word.istitle()\n", + "
\n", + " -4.194\n", + " \n", + " postag:NNP\n", + "
\n", + " -4.325\n", + " \n", + " word[-2:]:0s\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +3.603\n", + " \n", + " word.lower():al-qaida\n", + "
\n", + " +3.340\n", + " \n", + " word.lower():hamas\n", + "
\n", + " +3.270\n", + " \n", + " word.lower():parliament\n", + "
\n", + " +3.041\n", + " \n", + " -1:word.lower():telephoned\n", + "
\n", + " +2.991\n", + " \n", + " -1:word.lower():brunei\n", + "
\n", + " +2.966\n", + " \n", + " +1:word.lower():fought\n", + "
\n", + " +2.894\n", + " \n", + " word[-3:]:ban\n", + "
\n", + " +2.754\n", + " \n", + " -1:word.lower():extremist\n", + "
\n", + " +2.654\n", + " \n", + " +1:word.lower():influence\n", + "
\n", + " +2.630\n", + " \n", + " word.lower():westerners\n", + "
\n", + " … 1243 more positive …\n", + "
\n", + " … 169 more negative …\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +1.788\n", + " \n", + " +1:word.lower():david\n", + "
\n", + " +1.709\n", + " \n", + " +1:word.lower():saad\n", + "
\n", + " +1.659\n", + " \n", + " +1:word.lower():reports\n", + "
\n", + " +1.519\n", + " \n", + " +1:word.lower():clinton\n", + "
\n", + " +1.491\n", + " \n", + " -1:postag:NN\n", + "
\n", + " +1.482\n", + " \n", + " word.lower():rice\n", + "
\n", + " +1.385\n", + " \n", + " -1:word.lower():masjid\n", + "
\n", + " … 858 more positive …\n", + "
\n", + " … 182 more negative …\n", + "
\n", + " -1.365\n", + " \n", + " bias\n", + "
\n", + " -1.463\n", + " \n", + " +1:postag:NN\n", + "
\n", + " -1.588\n", + " \n", + " word[-3:]:ion\n", + "
\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eli5.show_weights(crf, top=10, targets=['O', 'B-org', 'I-per'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Or check only some of the features for all tags." + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y=O\n", + " \n", + "\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " -2.186\n", + " \n", + " word.isupper()\n", + "
\n", + " -2.720\n", + " \n", + " word.isdigit()\n", + "
\n", + " -3.387\n", + " \n", + " word.istitle()\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y=B-art\n", + " \n", + "\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +0.151\n", + " \n", + " word.istitle()\n", + "
\n", + " -0.214\n", + " \n", + " word.isupper()\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y=I-art\n", + " \n", + "\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +0.607\n", + " \n", + " word.istitle()\n", + "
\n", + " +0.597\n", + " \n", + " word.isdigit()\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y=B-eve\n", + " \n", + "\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +1.185\n", + " \n", + " word.isupper()\n", + "
\n", + " +0.391\n", + " \n", + " word.isdigit()\n", + "
\n", + " -0.200\n", + " \n", + " word.istitle()\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y=I-eve\n", + " \n", + "\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +0.919\n", + " \n", + " word.isupper()\n", + "
\n", + " +0.069\n", + " \n", + " word.istitle()\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y=B-geo\n", + " \n", + "\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +1.264\n", + " \n", + " word.istitle()\n", + "
\n", + " -0.046\n", + " \n", + " word.isupper()\n", + "
\n", + " -0.728\n", + " \n", + " word.isdigit()\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y=I-geo\n", + " \n", + "\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +0.726\n", + " \n", + " word.istitle()\n", + "
\n", + " +0.534\n", + " \n", + " word.isdigit()\n", + "
\n", + " -0.000\n", + " \n", + " word.isupper()\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y=B-gpe\n", + " \n", + "\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +2.970\n", + " \n", + " word.istitle()\n", + "
\n", + " +1.333\n", + " \n", + " word.isupper()\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y=I-gpe\n", + " \n", + "\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +0.210\n", + " \n", + " word.istitle()\n", + "
\n", + " -0.167\n", + " \n", + " word.isupper()\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y=B-nat\n", + " \n", + "\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +1.622\n", + " \n", + " word.isupper()\n", + "
\n", + " -0.252\n", + " \n", + " word.istitle()\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y=I-nat\n", + " \n", + "\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +0.003\n", + " \n", + " word.istitle()\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y=B-org\n", + " \n", + "\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +1.978\n", + " \n", + " word.isupper()\n", + "
\n", + " +0.000\n", + " \n", + " word.istitle()\n", + "
\n", + " -0.804\n", + " \n", + " word.isdigit()\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y=I-org\n", + " \n", + "\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +0.366\n", + " \n", + " word.istitle()\n", + "
\n", + " +0.021\n", + " \n", + " word.isupper()\n", + "
\n", + " -0.443\n", + " \n", + " word.isdigit()\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y=B-per\n", + " \n", + "\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +0.146\n", + " \n", + " word.istitle()\n", + "
\n", + " -0.098\n", + " \n", + " word.isdigit()\n", + "
\n", + " -1.003\n", + " \n", + " word.isupper()\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y=I-per\n", + " \n", + "\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +0.208\n", + " \n", + " word.istitle()\n", + "
\n", + " -0.020\n", + " \n", + " word.isdigit()\n", + "
\n", + " -0.391\n", + " \n", + " word.isupper()\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y=B-tim\n", + " \n", + "\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +2.573\n", + " \n", + " word.isdigit()\n", + "
\n", + " -0.435\n", + " \n", + " word.istitle()\n", + "
\n", + " -1.133\n", + " \n", + " word.isupper()\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "

\n", + " \n", + " \n", + " y=I-tim\n", + " \n", + "\n", + "\n", + "top features\n", + "

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + " Weight?\n", + " Feature
\n", + " +1.978\n", + " \n", + " word.isdigit()\n", + "
\n", + " -0.286\n", + " \n", + " word.isupper()\n", + "
\n", + " -1.304\n", + " \n", + " word.istitle()\n", + "
\n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eli5.show_weights(crf, top=10, feature_re='^word\\.is',\n", + " horizontal_layout=False, show=['targets'])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}