{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# MDI 720 : Statistiques\n", "## GLM\n", "### *Joseph Salmon*\n", "\n", "This notebook reproduces the pictures for the course \"GLM_fr\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt # for plots\n", "from matplotlib import rc\n", "from sklearn.linear_model import RidgeCV\n", "import seaborn as sns\n", "from os import mkdir, path\n", "from mpl_toolkits.mplot3d import Axes3D\n", "# interaction mode better for 3D\n", "\n", "from sklearn.linear_model import LogisticRegression, LinearRegression\n", "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.metrics import accuracy_score\n", "from sklearn import datasets\n", "from functions_classif import class_round, point_plot,\\\n", " frontiere, data_blob_generation, pdf_3D_plot\n", "%matplotlib notebook" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "dirname = \"../prebuiltimages/\"\n", "if not path.exists(dirname):\n", " mkdir(dirname)\n", "\n", "\n", "np.random.seed(seed=44)\n", "\n", "###############################################################################\n", "# Plot initialization\n", "\n", "plt.close('all')\n", "dirname = \"../srcimages/\"\n", "imageformat = '.pdf'\n", "\n", "\n", "rc('font', **{'family': 'sans-serif', 'sans-serif': ['Computer Modern Roman']})\n", "params = {'axes.labelsize': 12,\n", " 'font.size': 16,\n", " 'legend.fontsize': 16,\n", " 'text.usetex': True,\n", " 'figure.figsize': (8, 6)}\n", "plt.rcParams.update(params)\n", "\n", "sns.set_context(\"poster\")\n", "sns.set_palette(\"colorblind\")\n", "sns.set_style(\"white\")\n", "sns.axes_style()\n", "\n", "\n", "###############################################################################\n", "# display function:\n", "\n", "saving = False\n", "\n", "\n", "def my_saving_display(fig, dirname, filename, imageformat):\n", " \"\"\"\"Saving with personal function.\"\"\"\n", " filename = filename.replace('.', 'pt') # remove \".\" to avoid floats issues\n", " if saving is True:\n", " dirname + filename + imageformat\n", " image_name = dirname + filename + imageformat\n", " fig.savefig(image_name)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "digits = datasets.load_digits()\n", "images_and_labels = list(zip(digits.images, digits.target))\n", "fig = plt.figure(figsize=(12, 5))\n", "for index, (image, label) in enumerate(images_and_labels[:10]):\n", " plt.subplot(2, 5, index + 1)\n", " plt.axis('off')\n", " plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')\n", " plt.title('Training: %i' % label)\n", "plt.show()\n", "my_saving_display(fig, dirname, \"digits_plot\", imageformat)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "##############################################################################\n", "# Limits on a simulated example\n", "##############################################################################\n", "\n", "scenario = 4\n", "n_samples = 100\n", "n_test = 30\n", "if scenario == 3:\n", " # Example with 3 classes\n", " mean_0 = [0, 0]\n", " mean_1 = [1, 0]\n", " mean_2 = [0, 1]\n", " means = [mean_0, mean_1, mean_2]\n", " noise_level = 0.15\n", "elif scenario == 4:\n", " # Example with 4 classes\n", " mean_0 = [0, 0]\n", " mean_1 = [1, 0]\n", " mean_2 = [0, 1]\n", " mean_3 = [0.5, 0.5]\n", " means = [mean_0, mean_1, mean_2, mean_3]\n", " noise_level = 0.10\n", "elif scenario == 2:\n", " # Example with 4 classes\n", " mean_0 = [0, 0]\n", " mean_1 = [1, 1]\n", " noise_level = 0.10\n", " means = [mean_0, mean_1]\n", "\n", "n_class = len(means) # get the number of classes considered\n", "\n", "X, y = data_blob_generation(n_samples, means, noise_level)\n", "X_test, y_test = data_blob_generation(n_test, means, noise_level)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "##############################################################################\n", "# Naive linear regression on raw observations\n", "##############################################################################\n", "\n", "resolution_param = 500 # 500 for nice plotting, 50 for fast version\n", "\n", "regr = LinearRegression()\n", "regr.fit(X, y)\n", "y_pred_test = class_round(regr.predict(X_test), n_class)\n", "\n", "# Plotting part\n", "fig0 = plt.figure(figsize=(12, 8))\n", "title = \"Left out accuracy (regression w./o. dummy variables)\" + \\\n", " \": {:.2f}\".format(accuracy_score(y_test, y_pred_test))\n", "plt.title(title)\n", "\n", "\n", "def f(xx):\n", " \"\"\"Classifier\"\"\"\n", " return class_round(regr.predict(xx.reshape(1, -1)), n_class)\n", "frontiere(f, X, y, step=resolution_param)\n", "\n", "# reshape needed for sklearn convention on point to predict evaluations\n", "display1 = np.asarray([0, 0]).reshape(1, -1)\n", "display2 = np.asarray([0.5, 0.5]).reshape(1, -1)\n", "display3 = np.asarray([1, 1]).reshape(1, -1)\n", "\n", "point_plot(fig0, display1, np.zeros(n_class), (90, -60), color_txt='black')\n", "point_plot(fig0, display2, np.zeros(n_class), (-200, -15), color_txt='black')\n", "point_plot(fig0, display3, np.zeros(n_class), (-100, -60), color_txt='black')\n", "\n", "plt.show()\n", "my_saving_display(fig0, dirname, 'RawRegression' + 'n_class' +\n", " str(n_class), imageformat)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "##############################################################################\n", "# Naive linear regression on dummy variables\n", "##############################################################################\n", "\n", "# performing class estimation by performing multi-task learning with dummy\n", "# variables\n", "enc = OneHotEncoder()\n", "enc.fit(y.reshape(-1, 1))\n", "Y = enc.transform(y.reshape(-1, 1)).toarray()\n", "regr_multi = LinearRegression()\n", "regr_multi.fit(X, Y)\n", "proba_vector_test = regr_multi.predict(X_test)\n", "y_pred_test = class_round(regr.predict(X_test), n_class)\n", "\n", "# performance evaluation on new dataset\n", "y_pred_test = np.argmax(proba_vector_test, axis=1)\n", "title = \"Left out accuracy (regression w. dummy variables)\" + \\\n", " \": {:.2f}\".format(accuracy_score(y_test, y_pred_test))\n", "\n", "# Plotting part\n", "fig1 = plt.figure(figsize=(12, 8))\n", "plt.title(title)\n", "\n", "\n", "def f(xx):\n", " \"\"\"Classifier\"\"\"\n", " return np.argmax(regr_multi.predict(xx.reshape(1, -1)))\n", "frontiere(f, X, y, step=resolution_param)\n", "\n", "# naive method: estimate probability with linear model over indicators\n", "proba_vector1 = regr_multi.predict(display1)\n", "label_pred1 = np.argmax(proba_vector1)\n", "\n", "proba_vector2 = regr_multi.predict(display2)\n", "label_pred2 = np.argmax(proba_vector2)\n", "\n", "proba_vector3 = regr_multi.predict(display3)\n", "label_pred3 = np.argmax(proba_vector3)\n", "\n", "point_plot(fig0, display1, proba_vector1[0], (90, -60), color_txt='black')\n", "point_plot(fig0, display2, proba_vector2[0], (-200, -15), color_txt='black')\n", "point_plot(fig0, display3, proba_vector3[0], (-100, -60), color_txt='black')\n", "\n", "plt.show()\n", "my_saving_display(fig1, dirname, 'DummyRegression' + 'n_class' +\n", " str(n_class), imageformat)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "##############################################################################\n", "# Logistic regression\n", "##############################################################################\n", "\n", "clf = LogisticRegression()\n", "clf.fit(X, y)\n", "y_logit_test = clf.predict(X_test)\n", "title = \"Left out accuracy (logistic regression) \" + \\\n", " \": {:.2f}\".format(accuracy_score(y_test, y_logit_test))\n", "fig2 = plt.figure(figsize=(12, 8))\n", "plt.title(title)\n", "\n", "\n", "def f(xx):\n", " \"\"\"Classifier\"\"\"\n", " return int(clf.predict(xx.reshape(1, -1)))\n", "frontiere(f, X, y, step=resolution_param)\n", "\n", "proba_vector_logit1 = np.exp(clf.predict_log_proba(display1))[0]\n", "proba_vector_logit2 = np.exp(clf.predict_log_proba(display2))[0]\n", "proba_vector_logit3 = np.exp(clf.predict_log_proba(display3))[0]\n", "\n", "point_plot(fig0, display1, proba_vector_logit1, (90, -60), color_txt='black')\n", "point_plot(fig0, display2, proba_vector_logit2, (-200, -15), color_txt='black')\n", "point_plot(fig0, display3, proba_vector_logit3, (-100, -60), color_txt='black')\n", "\n", "plt.show()\n", "my_saving_display(fig2, dirname, 'LogisticRegression' + 'n_class' +\n", " str(n_class), imageformat)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "##############################################################################\n", "# Plotting the underlying distribution: mixture of isotropic Gaussian\n", "##############################################################################\n", "step = 250\n", "\n", "fig3 = pdf_3D_plot(means, n_class, noise_level, step)\n", "my_saving_display(fig2, dirname, 'mixt_of_iso_gaussian' + 'n_class' +\n", " str(n_class), imageformat)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "##############################################################################\n", "# Plotting the logistic function\n", "##############################################################################\n", "\n", "t = np.arange(-10, 10, step=0.01)\n", "\n", "\n", "def logistic(t):\n", " \"\"\"logistic\"\"\"\n", " return np.exp(t) / (1 + np.exp(t))\n", "\n", "\n", "fig0 = plt.figure(figsize=(8, 8))\n", "ax1 = plt.subplot(111)\n", "ax1.plot(t, logistic(t), label='logistique')\n", "\n", "ax1.set_ylim(-0.1, 1.1)\n", "ax1.set_xlim(-10, 10)\n", "plt.legend(loc=\"upper left\", fontsize=14)\n", "\n", "plt.show()\n", "my_saving_display(fig2, dirname, 'logistic', imageformat)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 2 }