diff --git a/Assignment02_nb.ipynb b/Assignment02_nb.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..ee125b320ecd7ff64b22342bb5cf997c6f2c89b2
--- /dev/null
+++ b/Assignment02_nb.ipynb
@@ -0,0 +1,301 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Assignment Sheet 2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook is part of the 2nd assignment. We will train a Decision Tree classifier using the sklearn library on the Iris dataset.\n",
+    "\n",
+    "Your task is to complete the missing code wherever it is marked with a **TODO**."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import seaborn as sns\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import graphviz\n",
+    "\n",
+    "from sklearn import datasets, tree\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import classification_report\n",
+    "from os import system\n",
+    "from IPython.display import Image\n",
+    "\n",
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# For further info https://archive.ics.uci.edu/ml/datasets/iris\n",
+    "iris = datasets.load_iris()\n",
+    "X = iris.data\n",
+    "y = iris.target"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Exploratory data analysis"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Quick look into the data structure"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(X.shape)\n",
+    "print(y.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(X[:5,:])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Using pandas; columns 'a'-'d' are the columns of X in order, 'target' is y\n",
+    "data = pd.concat([pd.DataFrame(X),pd.DataFrame(y)], axis=1)\n",
+    "data.columns=['a','b','c','d','target']\n",
+    "data.head(5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Exemplary plots"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.figure(figsize=(8,6))\n",
+    "sns.scatterplot(x=X[:,0], y=X[:,1], hue=y)\n",
+    "plt.xlabel('Sepal length')\n",
+    "plt.ylabel('Sepal width');"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Univariate hist_plot 'sepal_length'\n",
+    "class0_index = [i for i, j in enumerate(y) if j==0]\n",
+    "class1_index = [i for i, j in enumerate(y) if j==1]\n",
+    "class2_index = [i for i, j in enumerate(y) if j==2]\n",
+    "\n",
+    "# Map the integer targets to named labels so that seaborn builds the legend itself;\n",
+    "# overriding it afterwards with plt.legend(...) may pair labels with the wrong class.\n",
+    "class_names = {0: 'class1', 1: 'class2', 2: 'class3'}\n",
+    "class_labels = [class_names[label] for label in y]\n",
+    "\n",
+    "# No `data=` argument: x and hue are passed as vectors, and seaborn expects a\n",
+    "# DataFrame or mapping as data source (a NumPy array is rejected by recent versions).\n",
+    "sns.histplot(x=X[:,0], hue=class_labels, element='step')\n",
+    "plt.xlabel('Sepal length');"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO: Barplot over 'sepal-width'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO: Boxplot of all features"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Classification using decision trees"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Data preparation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Split data (fixed seed so that the split, and every score computed from it, is reproducible)\n",
+    "RANDOM_STATE = 42\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)\n",
+    "X_train.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Train DT classifier using sklearn; Visualization; Evaluation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO: Train a DT classifier (name it `clf`; the visualization cell below uses that name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Visualize: Export to .png image file\n",
+    "tree.export_graphviz(clf, out_file='tree.dot', feature_names=iris.feature_names, class_names=iris.target_names)\n",
+    "# os.system returns the exit status of the command; fail loudly instead of showing a stale or missing image\n",
+    "exit_status = system(\"dot -Tpng tree.dot -o tree1.png\")\n",
+    "if exit_status != 0:\n",
+    "    raise RuntimeError(\"Graphviz 'dot' failed - is Graphviz installed and on the PATH?\")\n",
+    "Image(\"tree1.png\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO: Evaluate the classifier's performance"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Train a second DT classifier using the Entropy instead of the Gini-Index (default)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO: Train the second classifier (name it `clf2`; the visualization cell below uses that name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Visualize #2\n",
+    "tree.export_graphviz(clf2, out_file='tree2.dot', feature_names=iris.feature_names, class_names=iris.target_names)\n",
+    "# os.system returns the exit status of the command; fail loudly instead of showing a stale or missing image\n",
+    "exit_status = system(\"dot -Tpng tree2.dot -o tree2.png\")\n",
+    "if exit_status != 0:\n",
+    "    raise RuntimeError(\"Graphviz 'dot' failed - is Graphviz installed and on the PATH?\")\n",
+    "Image(\"tree2.png\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO: Evaluate the second classifier's performance"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}