From 0cac7793b3097003cc5132b113c58627c5793349 Mon Sep 17 00:00:00 2001 From: manimeun <m.heu1995@gmail.com> Date: Wed, 3 Nov 2021 18:36:23 +0100 Subject: [PATCH] Add Assignment02.ipynb --- Assignment02_nb.ipynb | 465 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 465 insertions(+) create mode 100644 Assignment02_nb.ipynb diff --git a/Assignment02_nb.ipynb b/Assignment02_nb.ipynb new file mode 100644 index 0000000..52ba08e --- /dev/null +++ b/Assignment02_nb.ipynb @@ -0,0 +1,465 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ML4DS - Notebook for Assignment 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Author: Manuel Heurich<br>\n", + "Credit: Maximilian Idahl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook is part of the 2nd assignment. We will train a Decision Tree classifier on the Iris dataset using the scikit-learn (sklearn) library.\n", + "\n", + "Your task is to complete the missing code wherever it is marked with a **TODO**. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import graphviz\n", + "\n", + "from sklearn import datasets, tree\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import classification_report\n", + "from os import system\n", + "from IPython.display import Image\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# For further info https://archive.ics.uci.edu/ml/datasets/iris\n", + "iris = datasets.load_iris()\n", + "X = iris.data \n", + "y = iris.target" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exploratory data analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Quick look into the data structure" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(150, 4)\n", + "(150,)\n" + ] + } + ], + "source": [ + "print(X.shape)\n", + "print(y.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[5.1 3.5 1.4 0.2]\n", + " [4.9 3. 1.4 0.2]\n", + " [4.7 3.2 1.3 0.2]\n", + " [4.6 3.1 1.5 0.2]\n", + " [5. 
3.6 1.4 0.2]]\n" + ] + } + ], + "source": [ + "print(X[:5,:])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>a</th>\n", + " <th>b</th>\n", + " <th>c</th>\n", + " <th>d</th>\n", + " <th>target</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>5.1</td>\n", + " <td>3.5</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>4.9</td>\n", + " <td>3.0</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>4.7</td>\n", + " <td>3.2</td>\n", + " <td>1.3</td>\n", + " <td>0.2</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>4.6</td>\n", + " <td>3.1</td>\n", + " <td>1.5</td>\n", + " <td>0.2</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>5.0</td>\n", + " <td>3.6</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " a b c d target\n", + "0 5.1 3.5 1.4 0.2 0\n", + "1 4.9 3.0 1.4 0.2 0\n", + "2 4.7 3.2 1.3 0.2 0\n", + "3 4.6 3.1 1.5 0.2 0\n", + "4 5.0 3.6 1.4 0.2 0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Using pandas\n", + "data = pd.concat([pd.DataFrame(X),pd.DataFrame(y)], axis=1)\n", + "data.columns=['a','b','c','d','target']\n", + "data.head(5)" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example plots" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'Sepal width')" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 576x432 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(8,6))\n", + "sns.scatterplot(x=X[:,0], y=X[:,1], hue=y)\n", + "plt.xlabel('Sepal length')\n", + "plt.ylabel('Sepal width')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 0, 'Sepal length')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Univariate histogram of sepal length, one step outline per species.\n", + "# The species names are passed as hue so that seaborn labels the legend\n", + "# itself: plt.legend(labels) assigns labels in artist draw order, which\n", + "# need not match the class order, so classes could end up mislabelled.\n", + "\n", + "ax = sns.histplot(x=X[:,0], hue=iris.target_names[y], element='step')\n", + "ax.set_title('Sepal length by species')\n", + "ax.set_xlabel('Sepal length')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Barplot over 'sepal-width'" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Boxplot of all features" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Classification using decision trees " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "#### Data preparation" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(120, 4)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Split data 80/20: the fixed seed makes the split (and every score computed on it) reproducible; stratify keeps the class ratio equal in both parts\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n", + "X_train.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Train DT classifier using sklearn + Visualization + Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Train a DT classifier \n", + "clf = None" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize clf: Export to .png image file\n", + "\n", + "# tree.export_graphviz(clf, out_file='tree.dot') \n", + "# system(\"dot -Tpng tree.dot -o tree1.png\")\n", + "# Image(\"tree1.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Evaluation of the classifier's performance" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Train a second DT classifier using the Entropy instead of the Gini-Index (default)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Train the second classifier\n", + "clf2 = None" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize clf #2\n", + "\n", + "# tree.export_graphviz(clf2, out_file='tree2.dot') \n", + "# system(\"dot -Tpng tree2.dot -o tree2.png\")\n", + "# Image(\"tree2.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Evaluation of the classifier's performance" + ] + } + ], + 
"metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file -- GitLab