"""Avito duplicate-ads detection: pairwise feature building and an XGBoost baseline.

Script form of the notebook "Avito Duplicate Ads Detection.ipynb".

The pipeline loads item pairs and per-item information from CSV files, builds
one feature row per pair (per-item attributes for both items plus "same value"
indicator columns), trains a gradient-boosted tree classifier on a held-out
split and predicts the test pairs.

The output stored in the notebook for one run reported: train reduced from
2991396 to 1495698 rows, 1044196 test rows, 29 features, a validation AUC of
0.836958 after 260 boosting rounds, and a training time of 6.98 minutes.
"""
import datetime
import os
import random
import time
import zipfile
from operator import itemgetter

import numpy as np
import pandas as pd

# NOTE: xgboost and scikit-learn are imported inside run_default_test() so the
# data-preparation helpers stay importable without the training stack installed.

random.seed(2016)

# dtypes used when reading ItemInfo_*.csv.
_ITEM_INFO_DTYPES = {
    'itemID': np.dtype(int),
    'categoryID': np.dtype(int),
    'title': np.dtype(str),
    'description': np.dtype(str),
    'images_array': np.dtype(str),
    'attrsJSON': np.dtype(str),
    'price': np.dtype(float),
    'locationID': np.dtype(int),
    'metroID': np.dtype(float),
    'lat': np.dtype(float),
    'lon': np.dtype(float),
}

# Per-item columns carried into the pair frame as-is (missing values -> -1).
_ITEM_NUMERIC_COLUMNS = ['itemID', 'categoryID', 'price', 'locationID',
                         'metroID', 'lat', 'lon']

# Text columns that are reduced to a `len_<column>` feature.
_ITEM_TEXT_COLUMNS = ['title', 'description', 'attrsJSON']

# Columns that receive a `_1` / `_2` suffix when attached to a pair.
_PAIRED_COLUMNS = ['itemID', 'categoryID', 'parentCategoryID', 'price',
                   'locationID', 'regionID', 'metroID', 'lat', 'lon',
                   'len_title', 'len_description', 'len_attrsJSON']

# Columns for which a `<column>_same` 0/1 indicator is created.
_SAME_COLUMNS = ['price', 'locationID', 'categoryID', 'regionID', 'metroID',
                 'lat', 'lon']

# Pair identifiers that must never be used as model features.
_ID_COLUMNS = ('itemID_1', 'itemID_2')


def create_feature_map(features):
    """Write the feature map file ``xgb.fmap`` to the current directory.

    Each line has the form ``<index>\\t<feature name>\\tq``.

    Args:
        features: Iterable of feature names, in model column order.
    """
    # `with` guarantees the handle is closed even if a write fails.
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))


def get_importance(gbm, features):
    """Return ``(feature, score)`` pairs sorted by descending score.

    Writes ``xgb.fmap`` (see create_feature_map) and passes it to
    ``gbm.get_fscore``.

    Args:
        gbm: Trained booster exposing ``get_fscore(fmap=...)``.
        features: Feature names in the order used for training.

    Returns:
        List of ``(name, score)`` tuples, highest score first.
    """
    create_feature_map(features)
    scores = gbm.get_fscore(fmap='xgb.fmap')
    return sorted(scores.items(), key=itemgetter(1), reverse=True)


def intersect(a, b):
    """Return the elements common to ``a`` and ``b`` as a list.

    Duplicates are dropped and the order of the result is unspecified because
    it comes from set iteration.
    """
    return list(set(a) & set(b))


def print_features_importance(imp):
    """Print each importance entry as a comment line plus a removal snippet.

    For every ``(name, score)`` pair this prints ``# <score>`` followed by
    ``output.remove('<name>')``.

    Args:
        imp: Sequence of ``(name, score)`` pairs, e.g. from get_importance().
    """
    for name, score in imp:
        print("# " + str(score))
        print("output.remove('" + name + "')")


def run_default_test(train, test, features, target, random_state=0):
    """Train an XGBoost binary classifier and predict the test set.

    A 10% validation split is taken from ``train``; training runs for up to
    260 rounds with early stopping after 20 rounds without improvement of the
    validation AUC.

    Args:
        train: DataFrame containing ``features`` and the ``target`` column.
        test: DataFrame containing ``features``.
        features: List of feature column names.
        target: Name of the label column in ``train``.
        random_state: Seed used for the validation split and for XGBoost.

    Returns:
        Tuple ``(test_prediction, score)``: the predictions for ``test`` as a
        list, and the ROC AUC measured on the validation split.
    """
    # Function-scope imports: only this function needs the training stack.
    import xgboost as xgb
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split

    eta = 0.1
    max_depth = 5
    subsample = 0.8
    colsample_bytree = 0.8
    start_time = time.time()
    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    # NOTE(review): "silent" and the `ntree_limit` / `best_ntree_limit` names
    # used below look like older xgboost API - confirm against the installed
    # version before upgrading.
    params = {
        "objective": "binary:logistic",
        "booster": "gbtree",
        "eval_metric": "auc",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state,
    }
    num_boost_round = 260
    early_stopping_rounds = 20
    test_size = 0.1

    X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    y_train = X_train[target]
    y_valid = X_valid[target]
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)

    # The last watchlist entry ('eval') is the one reported for early stopping
    # in the recorded run.
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

    print("Validating...")
    check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=gbm.best_ntree_limit)
    # Despite the wording of the message, the printed value is the ROC AUC.
    score = roc_auc_score(y_valid.values, check)
    print('Check error value: {:.6f}'.format(score))

    imp = get_importance(gbm, features)
    print('Importance array: ', imp)

    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_ntree_limit)

    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist(), score


def get_features(train, test):
    """Return the model feature names shared by ``train`` and ``test``.

    The pair identifiers ``itemID_1`` / ``itemID_2`` are excluded. The result
    is sorted so the column order handed to the model does not depend on set
    iteration order (which varies between interpreter runs for strings).

    Args:
        train: Training DataFrame.
        test: Test DataFrame.

    Returns:
        Sorted list of column names present in both frames, without the ids.
    """
    shared = intersect(list(train.columns.values), list(test.columns.values))
    return sorted(column for column in shared if column not in _ID_COLUMNS)


def _load_item_features(data_dir, item_info_file):
    """Load one ItemInfo CSV and return the per-item feature frame.

    Numeric item columns get missing values replaced by -1, text columns are
    reduced to ``len_<column>`` (missing text stays NaN), and the category and
    location tables are left-joined on ``categoryID`` / ``locationID``.
    """
    items = pd.read_csv(os.path.join(data_dir, item_info_file), dtype=_ITEM_INFO_DTYPES)
    location = pd.read_csv(os.path.join(data_dir, "Location.csv"))
    category = pd.read_csv(os.path.join(data_dir, "Category.csv"))

    print('Add text features...')
    # Fill only the numeric columns: writing -1 into text columns would mix
    # integers into string data right before the .str accessor is used.
    features = items[_ITEM_NUMERIC_COLUMNS].fillna(-1)
    for column in _ITEM_TEXT_COLUMNS:
        features['len_' + column] = items[column].str.len()

    # Join on the key column only; combining `on=` with `left_index=True` is
    # rejected by current pandas.
    features = pd.merge(features, category, how='left', on='categoryID')
    features = pd.merge(features, location, how='left', on='locationID')
    return features


def _build_pair_features(pairs, item_features):
    """Attach item features for both pair members and add ``*_same`` flags."""
    frame = pairs
    for suffix in ('1', '2'):
        print('Merge item {}...'.format(suffix))
        renamed = item_features.rename(
            columns={column: '{}_{}'.format(column, suffix) for column in _PAIRED_COLUMNS}
        )
        frame = pd.merge(frame, renamed, how='left', on='itemID_' + suffix)

    print('Create same arrays')
    for column in _SAME_COLUMNS:
        # NaN never compares equal, so pairs with a missing joined value get 0.
        frame[column + '_same'] = np.equal(
            frame[column + '_1'], frame[column + '_2']
        ).astype(np.int32)
    return frame


def prep_train(data_dir="."):
    """Build the training pair frame from the CSV files in ``data_dir``.

    Reads ``ItemPairs_train.csv``, ``ItemInfo_train.csv``, ``Location.csv``
    and ``Category.csv``. The ``generationMethod`` column is dropped; the
    ``isDuplicate`` label is kept.

    Args:
        data_dir: Directory containing the CSV files (default: current dir).

    Returns:
        DataFrame with one row per pair: pair columns, ``*_1`` / ``*_2`` item
        features and ``*_same`` indicator columns.
    """
    start_time = time.time()

    pair_dtypes = {
        'itemID_1': np.dtype(int),
        'itemID_2': np.dtype(int),
        'isDuplicate': np.dtype(int),
        'generationMethod': np.dtype(int),
    }

    print("Load ItemPairs_train.csv")
    pairs = pd.read_csv(os.path.join(data_dir, "ItemPairs_train.csv"), dtype=pair_dtypes)
    print("Load ItemInfo_train.csv")
    item_features = _load_item_features(data_dir, "ItemInfo_train.csv")

    train = pairs.drop(['generationMethod'], axis=1)
    train = _build_pair_features(train, item_features)

    print('Create train data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return train


def prep_test(data_dir="."):
    """Build the test pair frame from the CSV files in ``data_dir``.

    Reads ``ItemPairs_test.csv``, ``ItemInfo_test.csv``, ``Location.csv`` and
    ``Category.csv``.

    Args:
        data_dir: Directory containing the CSV files (default: current dir).

    Returns:
        DataFrame with one row per pair: pair columns (including ``id``),
        ``*_1`` / ``*_2`` item features and ``*_same`` indicator columns.
    """
    start_time = time.time()

    pair_dtypes = {
        'itemID_1': np.dtype(int),
        'itemID_2': np.dtype(int),
        'id': np.dtype(int),
    }

    print("Load ItemPairs_test.csv")
    pairs = pd.read_csv(os.path.join(data_dir, "ItemPairs_test.csv"), dtype=pair_dtypes)
    print("Load ItemInfo_test.csv")
    item_features = _load_item_features(data_dir, "ItemInfo_test.csv")

    test = _build_pair_features(pairs, item_features)

    print('Create test data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return test


def read_test_train(data_dir=".", sample_frac=0.5, random_state=2016):
    """Load both data sets, fill missing values and pick the feature list.

    Args:
        data_dir: Directory containing the CSV files (default: current dir).
        sample_frac: Fraction of training rows to keep; ``None`` keeps all.
        random_state: Seed for the row sampling. ``random.seed`` does not
            affect pandas sampling, so the seed is passed explicitly to make
            the subset repeatable.

    Returns:
        Tuple ``(train, test, features)``.
    """
    train = prep_train(data_dir)
    test = prep_test(data_dir)
    train = train.fillna(-1)
    test = test.fillna(-1)

    # Get only subset of data
    if sample_frac is not None:
        len_old = len(train.index)
        train = train.sample(frac=sample_frac, random_state=random_state)
        len_new = len(train.index)
        print('Reduce train from {} to {}'.format(len_old, len_new))

    features = get_features(train, test)
    return train, test, features


# Still true when executed as a notebook cell or as a script; the guard only
# keeps an `import` of this module from starting a training run.
if __name__ == "__main__":
    train, test, features = read_test_train()
    print('Length of train: ', len(train))
    print('Length of test: ', len(test))
    print('Features [{}]: {}'.format(len(features), sorted(features)))
    test_prediction, score = run_default_test(train, test, features, 'isDuplicate')
    print('Real score = {}'.format(score))