From 2f19575eb524f13fc5f56771723bb64cd7f2f44b Mon Sep 17 00:00:00 2001 From: Heurich <manueh51@mars.imp.fu-berlin.de> Date: Fri, 8 Oct 2021 12:14:48 +0200 Subject: [PATCH] Add first assignment and intros --- Assignment01_nb.ipynb | 237 ++++ Pandas_intro.ipynb | 2776 ++++++++++++++++++++++++++++++++++++++ Python_numpy_intro.ipynb | 2500 ++++++++++++++++++++++++++++++++++ 3 files changed, 5513 insertions(+) create mode 100644 Assignment01_nb.ipynb create mode 100644 Pandas_intro.ipynb create mode 100644 Python_numpy_intro.ipynb diff --git a/Assignment01_nb.ipynb b/Assignment01_nb.ipynb new file mode 100644 index 0000000..89bb6d2 --- /dev/null +++ b/Assignment01_nb.ipynb @@ -0,0 +1,237 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "484313d8-8a62-44ed-801e-6e2d903194e7", + "metadata": { + "tags": [] + }, + "source": [ + "# Assignment Sheet 1 " + ] + }, + { + "cell_type": "markdown", + "id": "2dbfe254-4f29-4fbe-a465-3cd86ac7dfb4", + "metadata": {}, + "source": [ + "## Task 4 - Bivariate Descriptors" + ] + }, + { + "cell_type": "markdown", + "id": "52435766-3f59-42f8-aea4-fbddc841d4e6", + "metadata": {}, + "source": [ + "This notebook complements task 4 using the pandas library.\n", + "\n", + "The comments in each cell describe each small task you should complete. 
:)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c55209e-7c80-4e45-8ee7-1d49f9c0b1c9", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "80fcaacc-4a91-43f3-bd05-315769c3e8be", + "metadata": {}, + "source": [ + "### a) Heart Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4822bf49-37e8-49c8-8bb0-658b7bca611b", + "metadata": {}, + "outputs": [], + "source": [ + "# Dataset: https://www.kaggle.com/ronitf/heart-disease-uci/download\n", + "df = pd.read_csv('your/path/to/heart.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d17d2f78-21fe-495f-8501-c6dc7d5bdec0", + "metadata": {}, + "outputs": [], + "source": [ + "# Take the first 8 rows of the dataframe\n", + "small_df = df.head(8)\n", + "small_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8d417c2-426d-4ce4-ac83-ac94b765107c", + "metadata": {}, + "outputs": [], + "source": [ + "# Check std of each feature of the partial dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0cc68a9-f0d4-4f53-97f7-cdff7358fbfd", + "metadata": {}, + "outputs": [], + "source": [ + "# Check mean of each feature of the partial dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "477dbb53-3570-4287-8367-98d711ad280e", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate correlation for each feature in the dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e90cd02-3152-435b-98ee-05bc18c6c715", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "c5d4157a-f01d-49e2-bbc5-4a9c47572b6c", + "metadata": {}, + "source": [ + "### b) Titanic Dataset (train.csv)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "974508fe-0a81-444d-b296-2ce613d1df3c", + "metadata": {}, + "outputs": [], + "source": [ + 
"# Dataset: https://www.kaggle.com/c/titanic/data?select=train.csv\n", + "df = pd.read_csv('your/path/to/titanic/train.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76a8b05d-9afc-4bef-b31d-f94853810391", + "metadata": {}, + "outputs": [], + "source": [ + "# Take the first 16 rows of the dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f069ec6-900b-4004-adcb-a2da8b95ff4e", + "metadata": {}, + "outputs": [], + "source": [ + "# Show number of instances for male/female passengers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d9082fc-9da0-4e5b-8f0b-b681d46b7813", + "metadata": {}, + "outputs": [], + "source": [ + "# Show number of instances for each passenger class" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7a1ea7e-c708-417f-9d8f-e19448b0880b", + "metadata": {}, + "outputs": [], + "source": [ + "# Filter and print every contingency table entry for the Chi-Square calculation\n", + "# The features to check are 'Sex' and 'Pclass', equally to Task 4 on the exercise sheet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "643731b9-01e7-40af-90e3-5dc6bd2415ae", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate and print your Chi-Square solution for the\n", + "## Expected Values\n", + "### male and class1\n", + "m_c1 = \n", + "### male and class2\n", + "m_c2 = \n", + "### male and class3\n", + "m_c3 = \n", + "### female and class1\n", + "f_c1 = \n", + "### female and class2\n", + "f_c2 = \n", + "### female and class3\n", + "f_c3 = \n", + "\n", + "print('Expected values of the contingency matrix')\n", + "print(m_c1,' | ', f_c1)\n", + "print('--'*8)\n", + "print(m_c2,' | ', f_c2)\n", + "print('--'*8)\n", + "print(m_c3,' | ', f_c3)\n", + "\n", + "## chi-square calculation\n", + "chi_square = \n", + "\n", + "print('--'*8)\n", + "print('--'*8)\n", + "print('X^2 = ', chi_square)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "794dc6f4-7b1e-44bf-b704-69291dc173b6", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Pandas_intro.ipynb b/Pandas_intro.ipynb new file mode 100644 index 0000000..5685f49 --- /dev/null +++ b/Pandas_intro.ipynb @@ -0,0 +1,2776 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Mining I - Tutorial 2\n", + "\n", + "In this second tutorial we will have a look at the pandas library and how to use it to get to know your data. Again, this tutorial only covers a few basics and is far from complete. You can find more detailed tutorials\n", + "[here](https://pandas.pydata.org/pandas-docs/stable/getting_started/tutorials.html). There is also a nice \n", + "[Cheatsheet](http://pandas.pydata.org/Pandas_Cheat_Sheet.pdf) available.\n", + "\n", + "Lets get started by importing pandas:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*The pandas library provides high-performance, easy-to-use data structures and data analysis tools. The main data structure is the DataFrame, which you can think of as an in-memory 2D table (like a spreadsheet, with column names and row labels). Many features available in Excel are available programmatically, such as creating pivot tables, computing columns based on other columns, plotting graphs, etc. You can also group rows by column value, or join tables much like in SQL. 
Pandas is also great at handling time series.*\n", + "\n", + "The two useful data structures we will mainly use are:\n", + "* [Series](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html#pandas.Series) objects. A Series object is a 1D array, similar to a column in a spreadsheet (with a column name and row labels).\n", + "* [DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/frame.html) objects. This is a 2D table, similar to a spreadsheet (with column names and row labels)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Series\n", + "We can create Series objects in many different ways. To get an overview of all the options, have a look at the [documentation](https://pandas.pydata.org/pandas-docs/stable/getting_started/dsintro.html#dsintro)." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 1\n", + "1 3\n", + "2 5\n", + "3 1\n", + "4 6\n", + "5 8\n", + "dtype: int64\n", + "alice 1\n", + "bob 3\n", + "charles 5\n", + "darwin 1\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "s = pd.Series([1, 3, 5, 1, 6, 8]) # from list\n", + "print(s)\n", + "s = pd.Series([1, 3, 5, 1], index=[\"alice\", \"bob\", \"charles\", \"darwin\"]) # including index labels\n", + "print(s)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "b 1\n", + "a 0\n", + "c 2\n", + "dtype: int64" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s = pd.Series({'b': 1, 'a': 0, 'c': 2}) #from dict\n", + "s" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A Series acts very similarly to an ndarray and is a valid argument to most NumPy functions. However, operations such as slicing will also slice the index." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "b 2.718282\n", + "a 1.000000\n", + "c 7.389056\n", + "dtype: float64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "np.exp(s)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "b 2\n", + "a 0\n", + "c 4\n", + "dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s + s" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "b 1\n", + "a 0\n", + "dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s[:2]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A Series is also like a fixed-size dict, which means you can get and set values by index label:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s['a']" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "'b' in s" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To make it clear when you are accessing by label or by integer location, it is recommended to always use the `loc` attribute when accessing by label, and the `iloc` attribute when accessing by integer location:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "b 1\n", + "a 0\n", + "c 2\n", + "dtype: int64\n", + "accessing index label \"a\": 0\n", + "accessing integer location 1: 0\n" + ] + } + ], + "source": [ + "print(s)\n", + "print('accessing index label \"a\":', s.loc['a'])\n", + "print('accessing integer location 1:', s.iloc[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When an operation involves multiple `Series` objects, `pandas` automatically aligns items by matching index labels. If some index label is not present in one of the involved `Series`, the result for that label will be `NaN` (Not-a-Number, which marks a missing value), so don't forget to set the right index labels to avoid surprising results." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "b 1\n", + "a 0\n", + "c 2\n", + "dtype: int64\n", + "a 1\n", + "c 9\n", + "b 3\n", + "d 24\n", + "dtype: int64\n", + "a 1.0\n", + "b 4.0\n", + "c 11.0\n", + "d NaN\n", + "dtype: float64\n" + ] + } + ], + "source": [ + "print(s)\n", + "s2 = pd.Series({'a': 1, 'c': 9, 'b':3, 'd':24})\n", + "print(s2)\n", + "print(s + s2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DataFrame\n", + "A DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. You can think of it like a spreadsheet or SQL table, or a dict of Series objects. It is generally the most commonly used pandas object. Like Series, DataFrame accepts many different kinds of input:\n", + "* Dict of 1D arrays, lists, dicts or Series\n", + "* 2D numpy arrays\n", + "* Series\n", + "* Another DataFrame\n", + "* ...\n", + "\n", + "Along with the data, you can optionally pass **index** (row labels) and **columns** (column labels) arguments. If you pass an index and / or columns, you are guaranteeing the index and / or columns of the resulting DataFrame. 
Thus, a dict of Series plus a specific index will discard all data not matching up to the passed index." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>one</th>\n", + " <th>two</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>a</th>\n", + " <td>1.0</td>\n", + " <td>1.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>b</th>\n", + " <td>2.0</td>\n", + " <td>2.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>c</th>\n", + " <td>3.0</td>\n", + " <td>3.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>d</th>\n", + " <td>NaN</td>\n", + " <td>4.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " one two\n", + "a 1.0 1.0\n", + "b 2.0 2.0\n", + "c 3.0 3.0\n", + "d NaN 4.0" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame({'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),\n", + " 'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])})\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The row and column labels can be accessed respectively by accessing the **index** and **columns** attributes:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['a', 'b', 'c', 'd'], dtype='object')" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.index" + ] + 
}, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['one', 'two'], dtype='object')" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Pandas includes many functions to read a DataFrame from a variety of data formats:\n", + "* `pd.read_csv`\n", + "* `pd.read_json`\n", + "* `pd.read_sql`\n", + "* ...\n", + "\n", + "See [Pandas I/O documentation](https://pandas.pydata.org/pandas-docs/stable/reference/io.html) for more." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data\" # download link of a csv file\n", + "names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class-label']\n", + "df = pd.read_csv(url, names=names)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Viewing data\n", + "Here is how to view the top and bottom rows of the frame:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sepal-length</th>\n", + " <th>sepal-width</th>\n", + " <th>petal-length</th>\n", + " <th>petal-width</th>\n", + " <th>class-label</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>5.1</td>\n", + " <td>3.5</td>\n", + " 
<td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>4.9</td>\n", + " <td>3.0</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>4.7</td>\n", + " <td>3.2</td>\n", + " <td>1.3</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>4.6</td>\n", + " <td>3.1</td>\n", + " <td>1.5</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>5.0</td>\n", + " <td>3.6</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sepal-length sepal-width petal-length petal-width class-label\n", + "0 5.1 3.5 1.4 0.2 Iris-setosa\n", + "1 4.9 3.0 1.4 0.2 Iris-setosa\n", + "2 4.7 3.2 1.3 0.2 Iris-setosa\n", + "3 4.6 3.1 1.5 0.2 Iris-setosa\n", + "4 5.0 3.6 1.4 0.2 Iris-setosa" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sepal-length</th>\n", + " <th>sepal-width</th>\n", + " <th>petal-length</th>\n", + " <th>petal-width</th>\n", + " <th>class-label</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>145</th>\n", + " <td>6.7</td>\n", + " <td>3.0</td>\n", + " <td>5.2</td>\n", + " 
<td>2.3</td>\n", + " <td>Iris-virginica</td>\n", + " </tr>\n", + " <tr>\n", + " <th>146</th>\n", + " <td>6.3</td>\n", + " <td>2.5</td>\n", + " <td>5.0</td>\n", + " <td>1.9</td>\n", + " <td>Iris-virginica</td>\n", + " </tr>\n", + " <tr>\n", + " <th>147</th>\n", + " <td>6.5</td>\n", + " <td>3.0</td>\n", + " <td>5.2</td>\n", + " <td>2.0</td>\n", + " <td>Iris-virginica</td>\n", + " </tr>\n", + " <tr>\n", + " <th>148</th>\n", + " <td>6.2</td>\n", + " <td>3.4</td>\n", + " <td>5.4</td>\n", + " <td>2.3</td>\n", + " <td>Iris-virginica</td>\n", + " </tr>\n", + " <tr>\n", + " <th>149</th>\n", + " <td>5.9</td>\n", + " <td>3.0</td>\n", + " <td>5.1</td>\n", + " <td>1.8</td>\n", + " <td>Iris-virginica</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sepal-length sepal-width petal-length petal-width class-label\n", + "145 6.7 3.0 5.2 2.3 Iris-virginica\n", + "146 6.3 2.5 5.0 1.9 Iris-virginica\n", + "147 6.5 3.0 5.2 2.0 Iris-virginica\n", + "148 6.2 3.4 5.4 2.3 Iris-virginica\n", + "149 5.9 3.0 5.1 1.8 Iris-virginica" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.tail()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also take a random sample like this:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sepal-length</th>\n", + " <th>sepal-width</th>\n", + " <th>petal-length</th>\n", + " <th>petal-width</th>\n", + " <th>class-label</th>\n", 
+ " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>4.6</td>\n", + " <td>3.1</td>\n", + " <td>1.5</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>118</th>\n", + " <td>7.7</td>\n", + " <td>2.6</td>\n", + " <td>6.9</td>\n", + " <td>2.3</td>\n", + " <td>Iris-virginica</td>\n", + " </tr>\n", + " <tr>\n", + " <th>85</th>\n", + " <td>6.0</td>\n", + " <td>3.4</td>\n", + " <td>4.5</td>\n", + " <td>1.6</td>\n", + " <td>Iris-versicolor</td>\n", + " </tr>\n", + " <tr>\n", + " <th>103</th>\n", + " <td>6.3</td>\n", + " <td>2.9</td>\n", + " <td>5.6</td>\n", + " <td>1.8</td>\n", + " <td>Iris-virginica</td>\n", + " </tr>\n", + " <tr>\n", + " <th>108</th>\n", + " <td>6.7</td>\n", + " <td>2.5</td>\n", + " <td>5.8</td>\n", + " <td>1.8</td>\n", + " <td>Iris-virginica</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sepal-length sepal-width petal-length petal-width class-label\n", + "3 4.6 3.1 1.5 0.2 Iris-setosa\n", + "118 7.7 2.6 6.9 2.3 Iris-virginica\n", + "85 6.0 3.4 4.5 1.6 Iris-versicolor\n", + "103 6.3 2.9 5.6 1.8 Iris-virginica\n", + "108 6.7 2.5 5.8 1.8 Iris-virginica" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.sample(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `info` function gives you a concise summary of a DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 150 entries, 0 to 149\n", + "Data columns (total 5 columns):\n", + "sepal-length 150 non-null float64\n", + "sepal-width 150 non-null float64\n", + "petal-length 150 non-null float64\n", + "petal-width 150 non-null float64\n", + "class-label 150 non-null object\n", + "dtypes: float64(4), object(1)\n", + "memory 
usage: 5.9+ KB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Selection\n", + "Selecting a single column yields a Series:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 5.1\n", + "1 4.9\n", + "2 4.7\n", + "3 4.6\n", + "4 5.0\n", + "Name: sepal-length, dtype: float64" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['sepal-length'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Selecting via [], which slices the rows:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sepal-length</th>\n", + " <th>sepal-width</th>\n", + " <th>petal-length</th>\n", + " <th>petal-width</th>\n", + " <th>class-label</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>5.1</td>\n", + " <td>3.5</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>4.9</td>\n", + " <td>3.0</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>4.7</td>\n", + " <td>3.2</td>\n", + " <td>1.3</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sepal-length sepal-width 
petal-length petal-width class-label\n", + "0 5.1 3.5 1.4 0.2 Iris-setosa\n", + "1 4.9 3.0 1.4 0.2 Iris-setosa\n", + "2 4.7 3.2 1.3 0.2 Iris-setosa" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[0:3]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Access by label with `loc`:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4.9" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc[1,'sepal-length']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To access entries by position, make use of `iloc`:" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sepal-length</th>\n", + " <th>sepal-width</th>\n", + " <th>petal-length</th>\n", + " <th>petal-width</th>\n", + " <th>class-label</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>5.1</td>\n", + " <td>3.5</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>4.9</td>\n", + " <td>3.0</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>4.7</td>\n", + " <td>3.2</td>\n", + " <td>1.3</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " 
</tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sepal-length sepal-width petal-length petal-width class-label\n", + "0 5.1 3.5 1.4 0.2 Iris-setosa\n", + "1 4.9 3.0 1.4 0.2 Iris-setosa\n", + "2 4.7 3.2 1.3 0.2 Iris-setosa" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.iloc[0:3]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>petal-width</th>\n", + " <th>class-label</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " petal-width class-label\n", + "0 0.2 Iris-setosa\n", + "1 0.2 Iris-setosa\n", + "2 0.2 Iris-setosa" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.iloc[0:3, -2:]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can use Boolean Indexing to make conditional selections:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + 
"\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sepal-length</th>\n", + " <th>sepal-width</th>\n", + " <th>petal-length</th>\n", + " <th>petal-width</th>\n", + " <th>class-label</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>105</th>\n", + " <td>7.6</td>\n", + " <td>3.0</td>\n", + " <td>6.6</td>\n", + " <td>2.1</td>\n", + " <td>Iris-virginica</td>\n", + " </tr>\n", + " <tr>\n", + " <th>117</th>\n", + " <td>7.7</td>\n", + " <td>3.8</td>\n", + " <td>6.7</td>\n", + " <td>2.2</td>\n", + " <td>Iris-virginica</td>\n", + " </tr>\n", + " <tr>\n", + " <th>118</th>\n", + " <td>7.7</td>\n", + " <td>2.6</td>\n", + " <td>6.9</td>\n", + " <td>2.3</td>\n", + " <td>Iris-virginica</td>\n", + " </tr>\n", + " <tr>\n", + " <th>122</th>\n", + " <td>7.7</td>\n", + " <td>2.8</td>\n", + " <td>6.7</td>\n", + " <td>2.0</td>\n", + " <td>Iris-virginica</td>\n", + " </tr>\n", + " <tr>\n", + " <th>131</th>\n", + " <td>7.9</td>\n", + " <td>3.8</td>\n", + " <td>6.4</td>\n", + " <td>2.0</td>\n", + " <td>Iris-virginica</td>\n", + " </tr>\n", + " <tr>\n", + " <th>135</th>\n", + " <td>7.7</td>\n", + " <td>3.0</td>\n", + " <td>6.1</td>\n", + " <td>2.3</td>\n", + " <td>Iris-virginica</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sepal-length sepal-width petal-length petal-width class-label\n", + "105 7.6 3.0 6.6 2.1 Iris-virginica\n", + "117 7.7 3.8 6.7 2.2 Iris-virginica\n", + "118 7.7 2.6 6.9 2.3 Iris-virginica\n", + "122 7.7 2.8 6.7 2.0 Iris-virginica\n", + "131 7.9 3.8 6.4 2.0 Iris-virginica\n", + "135 7.7 3.0 6.1 2.3 Iris-virginica" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['sepal-length'] > 7.5]" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See the indexing documentation [Indexing and Selecting Data](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing) and [MultiIndex / Advanced Indexing](https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#advanced) for more." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A few useful operations\n", + "\n", + "We can create a new column containing computed values like this:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sepal-length</th>\n", + " <th>sepal-width</th>\n", + " <th>petal-length</th>\n", + " <th>petal-width</th>\n", + " <th>class-label</th>\n", + " <th>width-sum</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>5.1</td>\n", + " <td>3.5</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " <td>3.7</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>4.9</td>\n", + " <td>3.0</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " <td>3.2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>4.7</td>\n", + " <td>3.2</td>\n", + " <td>1.3</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " <td>3.4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>4.6</td>\n", + " <td>3.1</td>\n", + " <td>1.5</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " <td>3.3</td>\n", 
+ " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>5.0</td>\n", + " <td>3.6</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " <td>3.8</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sepal-length sepal-width petal-length petal-width class-label \\\n", + "0 5.1 3.5 1.4 0.2 Iris-setosa \n", + "1 4.9 3.0 1.4 0.2 Iris-setosa \n", + "2 4.7 3.2 1.3 0.2 Iris-setosa \n", + "3 4.6 3.1 1.5 0.2 Iris-setosa \n", + "4 5.0 3.6 1.4 0.2 Iris-setosa \n", + "\n", + " width-sum \n", + "0 3.7 \n", + "1 3.2 \n", + "2 3.4 \n", + "3 3.3 \n", + "4 3.8 " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['width-sum'] = df['sepal-width'] + df['petal-width']\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can apply functions to the data using `apply`:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sepal-length</th>\n", + " <th>sepal-width</th>\n", + " <th>petal-length</th>\n", + " <th>petal-width</th>\n", + " <th>class-label</th>\n", + " <th>width-sum</th>\n", + " <th>class-label-int</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>74</th>\n", + " <td>6.4</td>\n", + " <td>2.9</td>\n", + " <td>4.3</td>\n", + " <td>1.3</td>\n", + " <td>Iris-versicolor</td>\n", + " <td>4.2</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>4.7</td>\n", + " 
<td>3.2</td>\n", + " <td>1.6</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " <td>3.4</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30</th>\n", + " <td>4.8</td>\n", + " <td>3.1</td>\n", + " <td>1.6</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " <td>3.3</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>96</th>\n", + " <td>5.7</td>\n", + " <td>2.9</td>\n", + " <td>4.2</td>\n", + " <td>1.3</td>\n", + " <td>Iris-versicolor</td>\n", + " <td>4.2</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>58</th>\n", + " <td>6.6</td>\n", + " <td>2.9</td>\n", + " <td>4.6</td>\n", + " <td>1.3</td>\n", + " <td>Iris-versicolor</td>\n", + " <td>4.2</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sepal-length sepal-width petal-length petal-width class-label \\\n", + "74 6.4 2.9 4.3 1.3 Iris-versicolor \n", + "29 4.7 3.2 1.6 0.2 Iris-setosa \n", + "30 4.8 3.1 1.6 0.2 Iris-setosa \n", + "96 5.7 2.9 4.2 1.3 Iris-versicolor \n", + "58 6.6 2.9 4.6 1.3 Iris-versicolor \n", + "\n", + " width-sum class-label-int \n", + "74 4.2 1 \n", + "29 3.4 0 \n", + "30 3.3 0 \n", + "96 4.2 1 \n", + "58 4.2 1 " + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def class_labels_to_int(label):\n", + " if label == 'Iris-setosa':\n", + " return 0\n", + " elif label == 'Iris-versicolor':\n", + " return 1\n", + " elif label == 'Iris-virginica':\n", + " return 2\n", + "\n", + "df['class-label-int'] = df['class-label'].apply(class_labels_to_int)\n", + "df.sample(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `get_dummies` function creates a one-hot encoding for categorical columns:" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type 
{\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sepal-length</th>\n", + " <th>sepal-width</th>\n", + " <th>petal-length</th>\n", + " <th>petal-width</th>\n", + " <th>width-sum</th>\n", + " <th>class-label-int</th>\n", + " <th>class-label_Iris-setosa</th>\n", + " <th>class-label_Iris-versicolor</th>\n", + " <th>class-label_Iris-virginica</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>43</th>\n", + " <td>5.0</td>\n", + " <td>3.5</td>\n", + " <td>1.6</td>\n", + " <td>0.6</td>\n", + " <td>4.1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>5.8</td>\n", + " <td>4.0</td>\n", + " <td>1.2</td>\n", + " <td>0.2</td>\n", + " <td>4.2</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>35</th>\n", + " <td>5.0</td>\n", + " <td>3.2</td>\n", + " <td>1.2</td>\n", + " <td>0.2</td>\n", + " <td>3.4</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>69</th>\n", + " <td>5.6</td>\n", + " <td>2.5</td>\n", + " <td>3.9</td>\n", + " <td>1.1</td>\n", + " <td>3.6</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>63</th>\n", + " <td>6.1</td>\n", + " <td>2.9</td>\n", + " <td>4.7</td>\n", + " <td>1.4</td>\n", + " <td>4.3</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sepal-length sepal-width petal-length petal-width width-sum \\\n", + "43 5.0 3.5 1.6 0.6 
4.1 \n", + "14 5.8 4.0 1.2 0.2 4.2 \n", + "35 5.0 3.2 1.2 0.2 3.4 \n", + "69 5.6 2.5 3.9 1.1 3.6 \n", + "63 6.1 2.9 4.7 1.4 4.3 \n", + "\n", + " class-label-int class-label_Iris-setosa class-label_Iris-versicolor \\\n", + "43 0 1 0 \n", + "14 0 1 0 \n", + "35 0 1 0 \n", + "69 1 0 1 \n", + "63 1 0 1 \n", + "\n", + " class-label_Iris-virginica \n", + "43 0 \n", + "14 0 \n", + "35 0 \n", + "69 0 \n", + "63 0 " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.get_dummies(df).sample(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can remove columns using `drop`:" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sepal-length</th>\n", + " <th>sepal-width</th>\n", + " <th>petal-length</th>\n", + " <th>petal-width</th>\n", + " <th>class-label</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>5.1</td>\n", + " <td>3.5</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>4.9</td>\n", + " <td>3.0</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>4.7</td>\n", + " <td>3.2</td>\n", + " <td>1.3</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>4.6</td>\n", + " <td>3.1</td>\n", + " <td>1.5</td>\n", + " <td>0.2</td>\n", + 
" <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>5.0</td>\n", + " <td>3.6</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sepal-length sepal-width petal-length petal-width class-label\n", + "0 5.1 3.5 1.4 0.2 Iris-setosa\n", + "1 4.9 3.0 1.4 0.2 Iris-setosa\n", + "2 4.7 3.2 1.3 0.2 Iris-setosa\n", + "3 4.6 3.1 1.5 0.2 Iris-setosa\n", + "4 5.0 3.6 1.4 0.2 Iris-setosa" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.drop(labels=['width-sum', 'class-label-int'], axis=1)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Applying mean normalization (subtract the column mean, then divide by the column range):" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sepal-length</th>\n", + " <th>sepal-width</th>\n", + " <th>petal-length</th>\n", + " <th>petal-width</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>-0.206481</td>\n", + " <td>0.185833</td>\n", + " <td>-0.399774</td>\n", + " <td>-0.416111</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>-0.262037</td>\n", + " <td>-0.022500</td>\n", + " <td>-0.399774</td>\n", + " <td>-0.416111</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>-0.317593</td>\n", + " <td>0.060833</td>\n", + " <td>-0.416723</td>\n", + " <td>-0.416111</td>\n", + " </tr>\n", + "
<tr>\n", + " <th>3</th>\n", + " <td>-0.345370</td>\n", + " <td>0.019167</td>\n", + " <td>-0.382825</td>\n", + " <td>-0.416111</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>-0.234259</td>\n", + " <td>0.227500</td>\n", + " <td>-0.399774</td>\n", + " <td>-0.416111</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sepal-length sepal-width petal-length petal-width\n", + "0 -0.206481 0.185833 -0.399774 -0.416111\n", + "1 -0.262037 -0.022500 -0.399774 -0.416111\n", + "2 -0.317593 0.060833 -0.416723 -0.416111\n", + "3 -0.345370 0.019167 -0.382825 -0.416111\n", + "4 -0.234259 0.227500 -0.399774 -0.416111" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_features = df.iloc[:, :-1]\n", + "df_norm = (df_features - df_features.mean()) / (df_features.max() - df_features.min())\n", + "df_norm.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Univariate descriptors\n", + "\n", + "The `describe` function computes many of the univariate descriptors for a given DataFrame, giving a nice overview of the main aggregated values over each column:\n", + "* count: number of non-null (not NaN) values\n", + "* mean: mean of non-null values\n", + "* std: standard deviation of non-null values\n", + "* min: minimum of non-null values\n", + "* 25%, 50%, 75%: 25th, 50th and 75th percentile of non-null values\n", + "* max: maximum of non-null values" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr 
style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sepal-length</th>\n", + " <th>sepal-width</th>\n", + " <th>petal-length</th>\n", + " <th>petal-width</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>count</th>\n", + " <td>150.000000</td>\n", + " <td>150.000000</td>\n", + " <td>150.000000</td>\n", + " <td>150.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>mean</th>\n", + " <td>5.843333</td>\n", + " <td>3.054000</td>\n", + " <td>3.758667</td>\n", + " <td>1.198667</td>\n", + " </tr>\n", + " <tr>\n", + " <th>std</th>\n", + " <td>0.828066</td>\n", + " <td>0.433594</td>\n", + " <td>1.764420</td>\n", + " <td>0.763161</td>\n", + " </tr>\n", + " <tr>\n", + " <th>min</th>\n", + " <td>4.300000</td>\n", + " <td>2.000000</td>\n", + " <td>1.000000</td>\n", + " <td>0.100000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25%</th>\n", + " <td>5.100000</td>\n", + " <td>2.800000</td>\n", + " <td>1.600000</td>\n", + " <td>0.300000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>50%</th>\n", + " <td>5.800000</td>\n", + " <td>3.000000</td>\n", + " <td>4.350000</td>\n", + " <td>1.300000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75%</th>\n", + " <td>6.400000</td>\n", + " <td>3.300000</td>\n", + " <td>5.100000</td>\n", + " <td>1.800000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>max</th>\n", + " <td>7.900000</td>\n", + " <td>4.400000</td>\n", + " <td>6.900000</td>\n", + " <td>2.500000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sepal-length sepal-width petal-length petal-width\n", + "count 150.000000 150.000000 150.000000 150.000000\n", + "mean 5.843333 3.054000 3.758667 1.198667\n", + "std 0.828066 0.433594 1.764420 0.763161\n", + "min 4.300000 2.000000 1.000000 0.100000\n", + "25% 5.100000 2.800000 1.600000 0.300000\n", + "50% 5.800000 3.000000 4.350000 1.300000\n", + "75% 6.400000 3.300000 5.100000 1.800000\n", + "max 7.900000 4.400000 6.900000 2.500000" + ] + }, + "execution_count": 30, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Pandas includes functions to compute data descriptors:" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "sepal-length 5.843333\n", + "sepal-width 3.054000\n", + "petal-length 3.758667\n", + "petal-width 1.198667\n", + "dtype: float64" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.mean() # mean for each column" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "sepal-length 5.80\n", + "sepal-width 3.00\n", + "petal-length 4.35\n", + "petal-width 1.30\n", + "dtype: float64" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.median() # median for each column" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 5.0\n", + "dtype: float64" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['sepal-length'].mode() # mode for sepal-length" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "sepal-length 0.828066\n", + "sepal-width 0.433594\n", + "petal-length 1.764420\n", + "petal-width 0.763161\n", + "dtype: float64" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.std() # std for each column" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Pandas also includes functions to visualize these descriptors.\n", + "\n", + "We can plot histograms for each feature:" + ] + }, + { + "cell_type": "code", + 
"execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "%matplotlib inline\n", + "df.hist(column=['sepal-length'], bins=20);" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW4AAAD8CAYAAABXe05zAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAEFVJREFUeJzt3XuMpXV9x/H3R1gFlpsWOrHautoqtcWKdeINawa11GJra2uKJm2kF8dqQtVeFBsrkv5RrKZVe2WkFlIoiaI0dTcukLhHgheU5Sa4VFsEQW2BVFdGCQJ++8d5to67c2aeMztnZn8771dyss95rt888zuf/Z3fec55UlVIktrxiPUuQJI0HoNbkhpjcEtSYwxuSWqMwS1JjTG4JakxBrckNcbglqTGGNyS1JhDJ7HT4447rrZs2TKJXW843/72t9m8efN6lyEtyva5enbu3HlvVR3fZ92JBPeWLVu49tprJ7HrDWcwGDAzM7PeZUiLsn2uniR39F3XoRJJaozBLUmN6RXcSd6U5JYkNye5JMlhky5MkrS4ZYM7yeOAPwCmq+pE4BDglZMuTJK0uL5DJYcChyc5FDgC+NrkSpIkLWXZq0qq6qtJ3g18BbgfuKKqrth7vSSzwCzA1NQUg8FglUvdmObn5z2XOmDZPtdHlrsDTpJHAx8GTge+CXwIuLSqLhq1zfT0dHk54OrwcisdyGyfqyfJzqqa7rNun6GSFwNfrqp7qupB4CPA8/anQEnSyvX5As5XgOckOYLhUMmLALvTqyzJirbznqHSxrNsj7uqrgEuBa4DPt9tMzfhujacqlr08YS3bB25zNCWNqZeX3mvqrOBsydciySpB785KUmNMbglqTEGtyQ1xuCWpMYY3JLUGINbkhpjcEtSYwxuSWqMwS1JjTG4JakxBrckNcbglqTGGNyS1BiDW5IaY3BLUmMMbklqjMEtSY0xuCWpMQa3JDVm2eBOckKSGxY8vpXkjWtRnCRpX8veLLiq/gM4CSDJIcBXgcsmXJckaYRxh0peBPxXVd0xiWIkScsbN7hfCVwyiUIkSf0sO1SyR5JHAi8D3jpi+SwwCzA1NcVgMFiN+gSeSx2w5ufnbZ/roHdwA78IXFdV/7PYwqqaA+YApqena2ZmZv+rE2zfhudSB6rBYGD7XAfjDJW8CodJJGnd9epxJzkC+HngtZMtR9KBJsmKtquqVa5Ee/TqcVfVd6rqh6pq96QLknRgqaqRjye8ZevIZZocvzkpSY0xuCWpMQa3JDXG4JakxhjcktQYg1uSGmNwS1JjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMaM83vcWgVPP+cKdt//4FjbbDlr21jrH3P4Jm48+9SxtpHUDoN7je2+/0FuP/elvddfyQ/Vjxv0ktriUIkkNcbglqTGGNyS1BiDW5IaY3BLUmMMbklqTK/gTnJskkuT3JpkV5LnTrowSdLi+l7H/V5ge1W9IskjgSMmWJMkaQnLBneSo4EXAGcAVNV3ge9Ot
ixJ0ih9hkqeBNwD/HOS65Ocn2TzhOuSJI3QZ6jkUOBngTOr6pok7wXOAv5s4UpJZoFZgKmpKQaDwSqXevAY59zMz8+v6Fx6/rVWbGtrr09w3wXcVVXXdM8vZRjcP6Cq5oA5gOnp6Rr39zU2jO3bxvrtkZX8Vsm4x5BWzLa2LpYdKqmq/wbuTHJCN+tFwBcmWpUkaaS+V5WcCVzcXVFyG/DbkytJkrSUXsFdVTcA0xOuRZLUg7/HvcaOeupZPO3CfT4iWNqF4x4DoP9vfktqi8G9xu7bda43UpC0X/ytEklqjMEtSY0xuCWpMQa3JDXG4JakxhjcktQYg1uSGmNwS1JjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMYY3JLUGINbkhpjcEtSYwxuSWqMwS1Jjel167IktwP3AQ8DD1WVNw7eD2PfWmz7eOsfc/im8fYvAU8/5wp23//g2NuN056POXwTN5596tjH0A8a556Tp1TVvROrZIMY536TMHxRjLuNtBK7739w7LY27j1RvR/q6nCoRJIa07fHXcAVSQo4r6rm9l4hySwwCzA1NcVgMFi1Ijc6z6XWyrhtbX5+fuxtbM/7r29wn1xVX0vyw8CVSW6tqqsWrtCF+RzA9PR0jfP2SUvYvm2st6LSiq2grY07VGJ7Xh29hkqq6mvdv3cDlwHPmmRRkqTRlg3uJJuTHLVnGjgVuHnShUmSFtdnqGQKuCzJnvX/taq2T7QqSdJIywZ3Vd0GPH0NapEk9eDlgJLUGINbkhpjcEtSYwxuSWqMwS1JjTG4JakxBrckNcbglqTGGNyS1BiDW5IaY3BLUmMMbklqjMEtSY0Z52bBkg5iRz31LJ524Vnjb3jhOMcA8ObX+8vgPkB0v3e++LJ3jt6uqiZQjTai+3ad613eG+FQyQGiqhZ97NixY+QyQ1vamAxuSWqMwS1JjTG4JakxvYM7ySFJrk+ydZIFSZKWNk6P+w3ArkkVIknqp1dwJ3k8w4svz59sOZKk5fTtcb8HeDPwvQnWIknqYdkv4CT5JeDuqtqZZGaJ9WaBWYCpqSkGg8Fq1bihzc/Pey61ZsZtaytpn7bn/dfnm5MnAy9LchpwGHB0kouq6jcXrlRVc8AcwPT0dI3zbSqNNu4306QV275t7LY2dvtcwTG0r2WHSqrqrVX1+KraArwS+PjeoS1JWjtexy1JjRnrR6aqagAMJlKJJKkXe9yS1BiDW5IaY3BLUmMMbklqjMEtSY0xuCWpMQa3JDXG4JakxhjcktQYg1uSGmNwS1JjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMYY3JLUGINbkhpjcEtSYwxuSWrMssGd5LAkn01yY5JbkpyzFoVJkhbX5y7vDwAvrKr5JJuAq5N8rKo+M+HaJEmLWDa4q6qA+e7ppu5RkyxKkjRarzHuJIckuQG4G7iyqq6ZbFmSpFH6DJVQVQ8DJyU5FrgsyYlVdfPCdZLMArMAU1NTDAaD1a51Q5qfn/dcas2M29ZW0j5tz/uvV3DvUVXfTDIAXgLcvNeyOWAOYHp6umZmZlapxI1tMBjgudSa2L5t7LY2dvtcwTG0rz5XlRzf9bRJcjjwYuDWSRcmSVpcnx73Y4ELkxzCMOg/WFVbJ1uWJGmUPleV3AQ8Yw1qkST14DcnJakxBrckNcbglqTGGNyS1BiDW5IaY3BLUmMMbklqjMEtSY0xuCWpMQa3JDXG4JakxhjcktQYg1uSGmNwS1JjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMYY3JLUmGWDO8mPJtmRZFeSW5K8YS0KkyQtbtm7vAMPAX9UVdclOQrYmeTKqvrChGuTJC1i2R53VX29qq7rpu8DdgGPm3RhkqTF9elx/78kW4BnANcssmwWmAWYmppiMBjsf3Vifn7ec6k1M25bW0n7tD3vv97BneRI4MPAG6vqW3svr6o5YA5genq6ZmZmVqvGDW0wGOC51JrYvm3stjZ2+1zBMbSvX
leVJNnEMLQvrqqPTLYkSdJS+lxVEuCfgF1V9VeTL0mStJQ+Pe6Tgd8CXpjkhu5x2oTrkiSNsOwYd1VdDWQNapEk9eA3JyWpMQa3JDXG4JakxhjcktQYg1uSGmNwS1JjxvqtEkkHty1nbRt/o+39tznm8E3j71/7MLglAXD7uS8de5stZ21b0XbaPw6VSFJjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMYY3JLUGINbkhpjcEtSYwxuSWqMwS1Jjelzl/cPJLk7yc1rUZAkaWl9etwXAC+ZcB2SpJ6WDe6qugr43zWoRZLUg2PcktSYVfs97iSzwCzA1NQUg8FgtXa9oc3Pz3suta5OOeWUJZfnnYvP37FjxwSqEaxicFfVHDAHMD09XTMzM6u16w1tMBjgudR6qqqRy2yf68OhEklqTJ/LAS8BPg2ckOSuJL87+bIkSaMsO1RSVa9ai0IkSf04VCJJjTG4JakxBrckNcbglqTGGNyS1JgsdXH9inea3APcseo73piOA+5d7yKkEWyfq+cJVXV8nxUnEtxaPUmurarp9a5DWoztc304VCJJjTG4JakxBveBb269C5CWYPtcB45xS1Jj7HFLUmMM7gNAkpkkW0csGyRZ1U/tkxyb5PV9jq+DU5IzkvxIj/UuSPKKReZvmcQNxLu2+Lzljr/RGdwb07HA65ddSwezM4Blg3sdzADPW26ljc7g7inJ5iTbktyY5OYkpyd5ZpJPJNmZ5PIkj+3WHSR5T5JPdes+q5v/rG7e9d2/J4xZw6lJPp3kuiQfSnJkN//2JOd08z+f5Ce7+ccnubKbf16SO5IcB5wL/HiSG5K8q9v9kUkuTXJrkouTZNVOniau6wHfmuTCJDd1f8sjFmujXQ92Gri4awOHJ3l7ks917XVunL9/kkOSvKvb/qYkr+3mz3SvhX3aVZLTunlXJ3lfkq1JtgC/D7ypq+vnukO8oHu93Gbvu1NVPno8gF8H3r/g+THAp4Dju+enAx/opgd71gVeANzcTR8NHNpNvxj4cDc9A2wdcdwBwxfZccBVwOZu/luAt3fTtwNndtOvB87vpv8WeGs3/RKguv1s2VPTguPvBh7P8D/zTwPPX+9z7mOs9rml+/ue3D3/APAny7TR6QXbP2bB9L8Av9xNXwC8YsTx9rTrWeBt3fSjgGuBJ45qV8BhwJ3AE7ttLtnT/oF3AH+84DgXAB/qtv8p4D/X+1wfCI9Vu+fkBvB54N1J3glsBb4BnAhc2XUiDgG+vmD9SwCq6qokRyc5FjgKuDDJkxm+yDaNcfznMGy4n+yO90iGL4Q9PtL9uxP4tW76+cDLuzq2J/nGEvv/bFXdBZDkBoYvzKvHqE/r786q+mQ3fRHwpyzdRhc6JcmbgSOAxwC3AB/tedxTgZ9Z0Bs+Bngy8F0Wb1fzwG1V9eVu/UvobjQ+wr9V1feALySZ6lnTQc3g7qmqvpjkmcBpwF8AVwK3VNVzR22yyPM/B3ZU1cu7t4WDvTdKcjkwBVxbVb+3cBFwZY2+I9ED3b8P8/2/6zjDHQ8smF64D7Vj7zZ3H0u3UQCSHAb8PcMe+J1J3sGwV7xwnWcD53VP3w7ctHAxw3d8l++1zQyLt6txh+EW7sMhPBzj7q37BP47VXUR8G7g2cDxSZ7bLd+U5KcXbHJ6N//5wO6q2s2wJ/LVbvkZix2nqn6hqk7aK7QBPgOcnOQnuv0ekeQpy5R9NfAb3fqnAo/u5t/HsPevg8uP7WmPwKsYtplRbXRhG9gT0vd2n5vsM45cVdd07fKkqvr3vRZfDrwuyabuOE9JsnmJOm8FntR1XqB7rSxSl0awV9Xf04B3Jfke8CDwOuAh4H1JjmF4Lt/D8C0mwDeSfIrhuPbvdPP+kuFQyR8CHx/n4FV1T5IzgEuSPKqb/Tbgi0tsdk63/unAJxi+Tb6vqh5I8skML+f6GLBtnFp0wNoFvDrJecCXgL9hGKqLtdELgH9Mcj/wXOD9DIcDbwc+N
+Zxz2c4BHJd9+HjPcCvjlq5qu7P8HLU7UnuBT67YPFHgUuT/Apw5ph1bBh+c3ICkgwYfsBy7TrX8Sjg4ap6qOt1/UNVnbSeNWkyut7r1qo6cZ1L6SXJkVU13wX93wFfqqq/Xu+6WmGP++D2Y8AHkzyC4QdFr1nneqQ9XpPk1Qw/ZL+e74+fqwd73JLUGD+clKTGGNyS1BiDW5IaY3BLUmMMbklqjMEtSY35P0Rs20p2hm7DAAAAAElFTkSuQmCC\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "df.boxplot(column=['sepal-length', 'petal-length']);" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAE1CAYAAAD3ZxuaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAEv9JREFUeJzt3XuQZGV9xvHvwyLxLhAWQ1hwiaEiGOXiBk0wRkGQFCrEEm8BV0O5lVQSNaZU1KiQmFJjiRc0mi0uriYKqCCUVokUASPRgMtFLqKFIhoEZVWQFa8Lv/zRZ8KAs3TP9Myc6Xe+n6qp7nPmTPVDde3D6bff95xUFZKkybdN3wEkSfPDQpekRljoktQIC12SGmGhS1IjLHRJaoSFLkmNsNAlqREWuiQ1YtvFfLGddtqpVq9evZgvKUkT77LLLvtBVa0cdtyiFvrq1avZuHHjYr6kJE28JN8e5TiHXCSpERa6JDXCQpekRljoktQIC12SGjHSLJckNwKbgbuALVW1JsmOwBnAauBG4HlVddvCxJQkDTObM/SnVdW+VbWm2z4OuKCq9gQu6LYlST0ZZ8jlCGBD93wDcOT4cSRJczXqwqICPpekgH+rqvXAI6vqFoCquiXJzjP9YZJ1wDqA3XfffR4ij271cZ9Z1NdbbDe+7fC+Iyyc4x/Rd4KFdfyP+06woB634XF9R1hQV6+9uu8IMxq10A+sqpu70j4/yddGfYGu/NcDrFmzxjtSS9ICGWnIpapu7h5vBc4GDgC+n2QXgO7x1oUKKUkabmihJ3lIkodNPQcOBa4BzgXWdoetBc5ZqJCSpOFGGXJ5JHB2kqnjP1pVn03yZeDMJMcC3wGOWriYkqRhhhZ6Vd0A7DPD/h8CBy9EKEnS7LlSVJIaYaFLUiMsdElqhIUuSY2w0CWpERa6JDXCQpekRljoktQIC12SGmGhS1IjLHRJaoSFLkmNsNAlqREWuiQ1wkKXpEZY6JLUCAtdkhphoUtSIyx0SWqEhS5JjbDQJakRFrokNcJCl6RGWOiS1AgLXZIaYaFLUiMsdElqhIUuSY2w0CWpERa6JDXCQpekRljoktSIkQs9yYokVyT5dLe9R5JLklyf5Iwk2y1cTEnSMLM5Q38FcN207bcD76qqPYHbgGPnM5gkaXZGKvQkq4DDgZO77QAHAZ/oDtkAHLkQASVJoxn1DP3dwGuAu7vt3wRur6ot3fZNwK7znE2SNAtDCz3JM4Fbq+qy6btnOLS28vfrkmxMsnHTpk1zjClJGmaUM/QDgWcnuRE4ncFQy7uB7ZNs2x2zCrh5pj+uqvVVtaaq1qxcuXIeIkuSZjK00KvqdVW1qqpWAy8A/rOq/hy4EHhud9
ha4JwFSylJGmqceeivBV6V5BsMxtRPmZ9IkqS52Hb4IfeoqouAi7rnNwAHzH8kSdJcuFJUkhphoUtSIyx0SWqEhS5JjbDQJakRFrokNcJCl6RGWOiS1AgLXZIaYaFLUiMsdElqhIUuSY2w0CWpERa6JDXCQpekRljoktQIC12SGmGhS1IjLHRJaoSFLkmNsNAlqREWuiQ1wkKXpEZY6JLUCAtdkhphoUtSIyx0SWqEhS5JjbDQJakRFrokNcJCl6RGWOiS1AgLXZIaMbTQkzwwyaVJvpLk2iQndPv3SHJJkuuTnJFku4WPK0namlHO0H8BHFRV+wD7AocleRLwduBdVbUncBtw7MLFlCQNM7TQa+An3eYDup8CDgI+0e3fABy5IAklSSMZaQw9yYokVwK3AucD3wRur6ot3SE3Abtu5W/XJdmYZOOmTZvmI7MkaQYjFXpV3VVV+wKrgAOAvWY6bCt/u76q1lTVmpUrV849qSTpfs1qlktV3Q5cBDwJ2D7Jtt2vVgE3z280SdJsjDLLZWWS7bvnDwKeDlwHXAg8tztsLXDOQoWUJA237fBD2AXYkGQFg/8BnFlVn07yVeD0JG8BrgBOWcCckqQhhhZ6VV0F7DfD/hsYjKdLkpYAV4pKUiMsdElqhIUuSY2w0CWpERa6JDXCQpekRljoktQIC12SGmGhS1IjLHRJaoSFLkmNsNAlqREWuiQ1wkKXpEZY6JLUCAtdkhphoUtSIyx0SWqEhS5JjbDQJakRFrokNcJCl6RGWOiS1AgLXZIaYaFLUiMsdElqhIUuSY2w0CWpERa6JDXCQpekRljoktQIC12SGjG00JPsluTCJNcluTbJK7r9OyY5P8n13eMOCx9XkrQ1o5yhbwH+vqr2Ap4E/HWSvYHjgAuqak/ggm5bktSToYVeVbdU1eXd883AdcCuwBHAhu6wDcCRCxVSkjTcrMbQk6wG9gMuAR5ZVbfAoPSBnbfyN+uSbEyycdOmTeOllSRt1ciFnuShwCeBV1bVHaP+XVWtr6o1VbVm5cqVc8koSRrBSIWe5AEMyvw/quqsbvf3k+zS/X4X4NaFiShJGsUos1wCnAJcV1UnTvvVucDa7vla4Jz5jydJGtW2IxxzIHAMcHWSK7t9rwfeBpyZ5FjgO8BRCxNRkjSKoYVeVRcD2cqvD57fOJKkuXKlqCQ1wkKXpEZY6JLUCAtdkhphoUtSIyx0SWqEhS5JjbDQJakRFrokNcJCl6RGWOiS1AgLXZIaYaFLUiMsdElqhIUuSY2w0CWpERa6JDXCQpekRljoktQIC12SGmGhS1IjLHRJaoSFLkmNsNAlqREWuiQ1wkKXpEZY6JLUCAtdkhphoUtSIyx0SWqEhS5JjbDQJakRFrokNWJooSc5NcmtSa6Ztm/HJOcnub573GFhY0qShhnlDP1DwGH32XcccEFV7Qlc0G1Lkno0tNCr6r+AH91n9xHAhu75BuDIec4lSZqluY6hP7KqbgHoHnfe2oFJ1iXZmGTjpk2b5vhykqRhFvxL0apaX1VrqmrNypUrF/rlJGnZmmuhfz/JLgDd463zF0mSNBdzLfRzgbXd87XAOfMTR5I0V6NMW/wY8CXg95LclORY4G3AIUmuBw7ptiVJPdp22AFV9cKt/Orgec4iSRqDK0UlqREWuiQ1wkKXpEZY6JLUCAtdkhphoUtSIyx0SWqEhS5JjbDQJakRFrokNcJCl6RGWOiS1AgLXZIaYaFLUiMsdElqhIUuSY2w0CWpERa6JDXCQpekRljoktQIC12SGmGhS1IjLHRJaoSFLkmNsNAlqREWuiQ1wkKXpEZY6JLUCAtdkhphoUtSIyx0SWqEhS5JjRir0JMcluTrSb6R5Lj5CiVJmr05F3qSFcD7gT8F9gZemGTv+QomSZqdcc7QDwC+UVU3VNUvgdOBI+YnliRptrYd4293Bf532vZNwBPve1CSdcC6bvMnSb4+xmsudTsBP1isF8vbF+uVlo
VFfe84IYv2UsvE4v7be8miv3+PGuWgcQp9pv+i+rUdVeuB9WO8zsRIsrGq1vSdQ7PnezfZfP8GxhlyuQnYbdr2KuDm8eJIkuZqnEL/MrBnkj2SbAe8ADh3fmJJkmZrzkMuVbUlyd8A5wErgFOr6tp5SzaZlsXQUqN87yab7x+Qql8b9pYkTSBXikpSIyx0SWqEhS5JjbDQJakRFvoYkqxI8nd955AkcJbL2JJcVFVP7TuHZi/JnsBbGVxc7oFT+6vqd3oLpVlJcjjwWO79/v1jf4n6Nc7Sfw38d5L3AWcAd07trKrL+4ukEZ0GvBl4F/A04KXMfEkLLUFJPgg8mMF7dzLwXODSXkP1zDP0MSW5cIbdVVUHLXoYzUqSy6rqCUmurqrHdfu+UFV/3Hc2DZfkqqp6/LTHhwJnVdWhfWfri2foY6qqp/WdQXP28yTbANd3q56/C+zccyaN7mfd40+T/DbwQ2CPHvP0zi9Fx5TkEUlOTLKx+3lnkkf0nUsjeSWDj+wvB54AHA2s7TWRZuPTSbYH3gFcDtzI4L4My5ZDLmNK8kngGmBDt+sYYJ+qek5/qaTlJclvAA+sqh/3naVPnqGP79FV9ebuzk03VNUJgLMkJkCS87szvKntHZKc12cmjS7JUUke1m2+GjgtyX59ZuqbhT6+nyV58tRGkgO5Z2xPS9tOVXX71EZV3YZj6JPkjVW1ufv39wwGn5I/2HOmXvml6Pj+CtjQjZsH+BHwkl4TaVR3J9m9qr4DkORRzHDXLS1Zd3WPhwMfqKpzkhzfY57eOYY+T5I8HKCq7ug7i0aT5DAG19H+fLfrKcC6qnLYZQIk+TSDmUlPZ/Cl9s+AS6tqn16D9chCn6Mkr7q/31fViYuVRXOXZCfgSQw+XX2pqhbvRtEaS5IHA4cBV1fV9Ul2AR5XVZ/rOVpvHHKZu4cNP0RLUZLHVNXXkuzf7Zq6F+7u3RCMq3wnQFX9NMk3gWckeQbwheVc5uAZupahJOurap2rfCdbklcALwPO6nb9GbC+qk7qL1W/LPQxJVkFnAQcyOALtYuBV1TVTb0GkxqX5CrgD6vqzm77IQyGzR7fb7L+OOQyvtOAjwJHddtHd/sO6S2RRpbkj4DVTPu3UFUf7i2QZiPcM9OF7vmyvriahT6+lVV12rTtDyV5ZW9pNLIkHwEeDVzJPcVQgIU+GU4DLklydrd9JHBqj3l6Z6GP7wdJjgY+1m2/kMFFgrT0rQH2LscdJ1JVnZjkIuDJDM7MX1pVV/Sbql8W+vj+Angfg2tqF/DFbp+WvmuA3wJu6TuIZi/JR6rqGAYX5rrvvmXJQh9Tt8rw2X3n0JzsBHw1yaXAL6Z2VpXv52R47PSNJCsYLDBatiz0MSXZwGBWy+3d9g7AO6vKs/Sl7/i+A2j2krwOeD3woCR3cM8Xob9ksPJ32XLa4piSXFFV+w3bJ2l+JXlrVb2u7xxLiVdbHN823Vk5AEl2xE8+S1qSi7vHzUnumPazuTvj02R4Q5Kjk7wRIMluSQ7oO1SfPEMfU5IXA68DPtHtOgr456r6SH+ppPYl+QBwN3BQVe3VnVh9rqr+oOdovfFMckxV9eEkG4GDGIzlPaeqvtpzLI2g+zR1X5ur6leLHkZz8cSq2j/JFTC4nn2S7foO1ScLfY6SPLyq7uhK4XsMVotO/W7HqvpRf+k0osuB3YDbGPzPeHvgliS3Ai+rqsv6DKehftXNbCmAJCsZnLEvWxb63H0UeCZwGfe+KUK6bW9Dt/R9Fjh76vrnSQ5lcDnWM4F/BZ7YYzYN917gbGDnJP8MPBf4h34j9csxdC1bSTZW1ZqZ9iW5sqr27SubRpPkMcDBDE6kLqiq63qO1CtnuYwpyYHdVd7ovnE/McnufefSSH6U5LVJHtX9vAa4rfsYv6w/uk+CJI8GvlVV72ew6veQ6Tf9Xo4s9PF9APhpkn2A1wDfBpzhMhleBKwCPtX97NbtWwE8r8dcGs0ngbuS/C5wMr
AH077LWo4cQx/flqqqJEcA76mqU5Ks7TuU7l93Fv7aqvrbrRzyjcXMozm5u6q2JHkOg397J03NeFmuLPTxbe6WIh8NPKUrigf0nElDVNVdSZb1dT8a8KskLwReDDyr27es/+1Z6ON7PoOP6cdW1fe68fN39JxJo7kiybnAx4E7p3ZW1Vlb/xMtIS8F/pLBQr5vJdkD+PeeM/XKWS5j6M7Gz6uqp/edRbOX5LQZdpcXVps8Sfb35t4W+ti6M7xjqurHfWeRlqskl1fV/n3n6JtDLuP7OXB1kvO598f2l/cXSfcnyWuq6l+SnMS9F4UBvncTalnfS3SKhT6+z3Q/mhxTi0829ppC8+mEvgMsBQ65zIMkDwJ2r6qv951Fo0uy33K/B+UkS3IgcGVV3dnd13d/BtMXv91ztN64sGhMSZ7F4K7xn+229+3G1bX0nZjka0n+Kcljhx+uJWb6or5XM1jU9+F+I/XLQh/f8cABwO0AVXUlgxVrWuKq6mnAU4FNwPokVydZ1hd3mjBbajDEcATw3qp6D/CwnjP1ykIf35YZZrg4jjUhqup7VfVeBvOZrwTe1HMkjW76or7PuKjPQp8P1yR5EbAiyZ7dzIkv9h1KwyXZK8nxSa4B3sfgfVvVcyyN7vnAL+gW9QG7sswX9fml6JiSPBh4A3Bot+s84C1V9fP+UmkUSf4H+Bjw8aq6ue880rgs9DE5U6INrjScHEkurqonJ9nMDDeXqaqH9xStdxb6mJJcCOzC4Hogp1fVtT1H0hy40lAtcAx9TM6UaIYrDSdIkm267z40jYU+D5wp0QRXGk6Qqrob+Ip3B7s3C31MzpSYXNNvHwg8tLt94KN6DaXZ2AW4NskFSc6d+uk7VJ8cQx+TMyUmV5KrgH2AxzNYYXgq8Jyq+pNeg2kkSWZ8n6rq84udZamw0OeRMyUmy9QXoUneBHy3u32gX45qYnm1xfl1MoMLBGkyePvACTTDdMX//xXLfNqihT6/nCkxWbx94ASqqmV9vZb745DLPEpyZFV9qu8ckpYnZ7mMyZkSkyfJxd3j5iR3TPvZnOSOvvNJc+UZ+picKSFpqfAMfXxek3kCudJQLbLQx+c1mSeQKw3VIme5jM+ZEpNraqXhpcCdUzur6tn9RZLmzjF0LVuuNFRrLPQ58prMkpYaC13LjisN1SoLfQxJtgGuqqrf7zuLJDnLZQzOlJC0lDjLZXzOlJC0JFjo4/NON5KWBMfQJakRnqHPkTMlJC01nqFLUiOc5SJJjbDQJakRFrokNcJCl6RG/B/qAEDFYL+j/AAAAABJRU5ErkJggg==\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "df['class-label'].value_counts().plot(kind='bar');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bivariate descriptors\n", + "\n", + "Computing pairwise correlation coefficients (measuring linear association between two variables):" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: 
middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sepal-length</th>\n", + " <th>sepal-width</th>\n", + " <th>petal-length</th>\n", + " <th>petal-width</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>sepal-length</th>\n", + " <td>1.000000</td>\n", + " <td>-0.109369</td>\n", + " <td>0.871754</td>\n", + " <td>0.817954</td>\n", + " </tr>\n", + " <tr>\n", + " <th>sepal-width</th>\n", + " <td>-0.109369</td>\n", + " <td>1.000000</td>\n", + " <td>-0.420516</td>\n", + " <td>-0.356544</td>\n", + " </tr>\n", + " <tr>\n", + " <th>petal-length</th>\n", + " <td>0.871754</td>\n", + " <td>-0.420516</td>\n", + " <td>1.000000</td>\n", + " <td>0.962757</td>\n", + " </tr>\n", + " <tr>\n", + " <th>petal-width</th>\n", + " <td>0.817954</td>\n", + " <td>-0.356544</td>\n", + " <td>0.962757</td>\n", + " <td>1.000000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sepal-length sepal-width petal-length petal-width\n", + "sepal-length 1.000000 -0.109369 0.871754 0.817954\n", + "sepal-width -0.109369 1.000000 -0.420516 -0.356544\n", + "petal-length 0.871754 -0.420516 1.000000 0.962757\n", + "petal-width 0.817954 -0.356544 0.962757 1.000000" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.corr()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Visual inspection of correlation:" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 720x720 with 16 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + 
], + "source": [ + "pd.plotting.scatter_matrix(df, figsize=(10,10));" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For categorical features, we can use `pd.crosstab` to get the contingency table:" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>color</th>\n", + " <th>gender</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>blue</td>\n", + " <td>male</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>yellow</td>\n", + " <td>female</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>red</td>\n", + " <td>male</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>yellow</td>\n", + " <td>male</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>yellow</td>\n", + " <td>female</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>red</td>\n", + " <td>male</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " color gender\n", + "0 blue male\n", + "1 yellow female\n", + "2 red male\n", + "3 yellow male\n", + "4 yellow female\n", + "5 red male" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame([['blue', 'male'], ['yellow', 'female'], ['red', 'male'], ['yellow', 'male'], ['yellow', 'female'], ['red', 'male']])\n", + "df.columns = ['color', 'gender']\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, 
+ "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th>gender</th>\n", + " <th>female</th>\n", + " <th>male</th>\n", + " <th>All</th>\n", + " </tr>\n", + " <tr>\n", + " <th>color</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>blue</th>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>red</th>\n", + " <td>0</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>yellow</th>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>All</th>\n", + " <td>2</td>\n", + " <td>4</td>\n", + " <td>6</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "gender female male All\n", + "color \n", + "blue 0 1 1\n", + "red 0 2 2\n", + "yellow 2 1 3\n", + "All 2 4 6" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.crosstab(df['color'], df['gender'], margins=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Handling Missing data\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table 
border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>0</th>\n", + " <th>1</th>\n", + " <th>2</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1</td>\n", + " <td>2.0</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>4</td>\n", + " <td>NaN</td>\n", + " <td>6</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " 0 1 2\n", + "0 1 2.0 3\n", + "1 4 NaN 6" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame([[1,2,3],\n", + " [4,np.nan,6]])\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To drop any rows that have missing data:" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>0</th>\n", + " <th>1</th>\n", + " <th>2</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1</td>\n", + " <td>2.0</td>\n", + " <td>3</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " 0 1 2\n", + "0 1 2.0 3" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dropna(how='any')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Filling missing data with a constant:" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>0</th>\n", + " <th>1</th>\n", + " <th>2</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1</td>\n", + " <td>2.0</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>4</td>\n", + " <td>5.0</td>\n", + " <td>6</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " 0 1 2\n", + "0 1 2.0 3\n", + "1 4 5.0 6" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.fillna(value=5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Filling missing data with column mean:" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>0</th>\n", + " <th>1</th>\n", + " <th>2</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1</td>\n", + " <td>2.0</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>4</td>\n", + " <td>2.0</td>\n", + " <td>6</td>\n", 
+ " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " 0 1 2\n", + "0 1 2.0 3\n", + "1 4 2.0 6" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.fillna(df.mean())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To get the boolean mask where values are nan:" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>0</th>\n", + " <th>1</th>\n", + " <th>2</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>False</td>\n", + " <td>True</td>\n", + " <td>False</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " 0 1 2\n", + "0 False False False\n", + "1 False True False" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.isna(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What next?\n", + "\n", + "As you probably noticed by now, pandas is quite a large library with many features. Although we went through the most important features, there is still a lot to discover. Probably the best way to learn more is to get your hands dirty with some real-life data. 
It is also a good idea to go through pandas' excellent [documentation](http://pandas.pydata.org/pandas-docs/stable/index.html), in particular the [Cookbook](http://pandas.pydata.org/pandas-docs/stable/user_guide/cookbook.html)." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.2" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Python_numpy_intro.ipynb b/Python_numpy_intro.ipynb new file mode 100644 index 0000000..7f3fa38 --- /dev/null +++ b/Python_numpy_intro.ipynb @@ -0,0 +1,2500 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Mining I - Tutorial 1\n", + "\n", + "In this first tutorial we will cover some python and numpy basics, as well as a short introduction of the Iris dataset.\n", + "\n", + "The Python and Numpy part of this tutorial is based on the jupyter notebook version of the CS231n Python + Numpy Tutorial created by Justin Johnson, which can be found [here](https://github.com/kuleshov/cs228-material/blob/master/tutorials/python/cs228-python-tutorial.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Python" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Python is a high-level, dynamically typed multiparadigm programming language. Python code is often said to be almost like pseudocode, since it allows you to express very powerful ideas in very few lines of code while being very readable. 
As an example, here is an implementation of the classic quicksort algorithm in Python:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1, 1, 2, 3, 6, 8, 10]\n" + ] + } + ], + "source": [ + "def quicksort(arr):\n", + " if len(arr) <= 1:\n", + " return arr\n", + " pivot = arr[len(arr) // 2]\n", + " left = [x for x in arr if x < pivot]\n", + " middle = [x for x in arr if x == pivot]\n", + " right = [x for x in arr if x > pivot]\n", + " return quicksort(left) + middle + quicksort(right)\n", + "\n", + "print(quicksort([3,6,8,10,1,2,1]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Python versions\n", + "There are two major versions of Python, 2 and 3, but Python 2 reached its end of life in January 2020 and is no longer supported. \n", + "Somewhat confusingly, Python 3 introduced many backwards-incompatible changes to the language, so code written for e.g. 2.7 may not work under 3.6 and vice versa. For this class all code will use Python >= 3.6.\n", + "\n", + "You can check your Python version at the command line by running `python --version`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Basic data types\n", + "Like most languages, Python has a number of basic types including integers, floats, booleans, and strings. 
These data types behave in ways that are familiar from other programming languages.\n", + "\n", + "**Numbers**: Integers and floats work as you would expect from other languages:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'int'>\n", + "3\n", + "4\n", + "2\n", + "6\n", + "9\n", + "4\n", + "8\n", + "<class 'float'>\n", + "2.5 3.5 5.0 6.25\n" + ] + } + ], + "source": [ + "x = 3\n", + "print(type(x)) # Prints \"<class 'int'>\"\n", + "print(x) # Prints \"3\"\n", + "print(x + 1) # Addition; prints \"4\"\n", + "print(x - 1) # Subtraction; prints \"2\"\n", + "print(x * 2) # Multiplication; prints \"6\"\n", + "print(x ** 2) # Exponentiation; prints \"9\"\n", + "x += 1\n", + "print(x) # Prints \"4\"\n", + "x *= 2\n", + "print(x) # Prints \"8\"\n", + "y = 2.5\n", + "print(type(y)) # Prints \"<class 'float'>\"\n", + "print(y, y + 1, y * 2, y ** 2) # Prints \"2.5 3.5 5.0 6.25\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that unlike many languages, Python does not have unary increment (x++) or decrement (x--) operators.\n", + "\n", + "Python integers have arbitrary precision (Python 3 has no separate long integer type), and there is also a built-in type for complex numbers; you can find all of the details in the [documentation](https://docs.python.org/3.7/library/stdtypes.html#numeric-types-int-float-complex)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Booleans**: Python implements all of the usual operators for Boolean logic, but uses English words rather than symbols (`&&`, `||`, etc.):" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'bool'>\n", + "False\n", + "True\n", + "False\n", + "True\n" + ] + } + ], + "source": [ + "t = True\n", + "f = False\n", + "print(type(t)) # Prints \"<class 'bool'>\"\n", + "print(t and f) # Logical AND; prints \"False\"\n", + "print(t or f) # Logical OR; prints \"True\"\n", + "print(not t) # Logical NOT; prints \"False\"\n", + "print(t != f) # Logical XOR; prints \"True\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Strings**: Python has great support for strings:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hello\n", + "5\n", + "hello world\n", + "hello world 12\n", + "hello world 12\n" + ] + } + ], + "source": [ + "hello = 'hello' # String literals can use single quotes\n", + "world = \"world\" # or double quotes; it does not matter.\n", + "print(hello) # Prints \"hello\"\n", + "print(len(hello)) # String length; prints \"5\"\n", + "hw = hello + ' ' + world # String concatenation\n", + "print(hw) # prints \"hello world\"\n", + "hw12 = '%s %s %d' % (hello, world, 12) # sprintf style string formatting\n", + "print(hw12) # prints \"hello world 12\"\n", + "hw12 = f'{hello} {world} {12}' # f-strings, a new and improved way of formatting strings, available in python >= 3.6\n", + "print(hw12)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "String objects have a bunch of useful methods; for example:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "Hello\n", + "HELLO\n", + " hello\n", + " hello \n", + "he(ell)(ell)o\n", + "world\n" + ] + } + ], + "source": [ + "s = \"hello\"\n", + "print(s.capitalize()) # Capitalize a string; prints \"Hello\"\n", + "print(s.upper()) # Convert a string to uppercase; prints \"HELLO\"\n", + "print(s.rjust(7)) # Right-justify a string, padding with spaces; prints \" hello\"\n", + "print(s.center(7)) # Center a string, padding with spaces; prints \" hello \"\n", + "print(s.replace('l', '(ell)')) # Replace all instances of one substring with another;\n", + " # prints \"he(ell)(ell)o\"\n", + "print(' world '.strip()) # Strip leading and trailing whitespace; prints \"world\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can find a list of all string methods in the [documentation](https://docs.python.org/3.7/library/stdtypes.html#string-methods)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Containers\n", + "Python includes several built-in container types: lists, dictionaries, sets, and tuples." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Lists\n", + "A list is the Python equivalent of an array, but is resizeable and can contain elements of different types:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[3, 1, 2] 2\n", + "2\n", + "[3, 1, 'foo']\n", + "[3, 1, 'foo', 'bar']\n", + "bar [3, 1, 'foo']\n" + ] + } + ], + "source": [ + "xs = [3, 1, 2] # Create a list\n", + "print(xs, xs[2]) # Prints \"[3, 1, 2] 2\"\n", + "print(xs[-1]) # Negative indices count from the end of the list; prints \"2\"\n", + "xs[2] = 'foo' # Lists can contain elements of different types\n", + "print(xs) # Prints \"[3, 1, 'foo']\"\n", + "xs.append('bar') # Add a new element to the end of the list\n", + "print(xs) # Prints \"[3, 1, 'foo', 'bar']\"\n", + "x = xs.pop() # Remove and return the last element of the list\n", + "print(x, xs) # Prints \"bar [3, 1, 'foo']\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As usual, you can find all the gory details about lists in the [documentation](https://docs.python.org/3.7/tutorial/datastructures.html#more-on-lists)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Slicing**: In addition to accessing list elements one at a time, Python provides concise syntax to access sublists; this is known as slicing:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0, 1, 2, 3, 4]\n", + "[2, 3]\n", + "[2, 3, 4]\n", + "[0, 1]\n", + "[0, 1, 2, 3, 4]\n", + "[0, 1, 2, 3]\n", + "[0, 1, 8, 9, 4]\n" + ] + } + ], + "source": [ + "nums = list(range(5)) # range produces a sequence of integers; list() turns it into a list\n", + "print(nums) # Prints \"[0, 1, 2, 3, 4]\"\n", + "print(nums[2:4]) # Get a slice from index 2 to 4 (exclusive); prints \"[2, 3]\"\n", + "print(nums[2:]) # Get a slice from index 2 to the end; prints \"[2, 3, 4]\"\n", + "print(nums[:2]) # Get a slice from the start to index 2 (exclusive); prints \"[0, 1]\"\n", + "print(nums[:]) # Get a slice of the whole list; prints \"[0, 1, 2, 3, 4]\"\n", + "print(nums[:-1]) # Slice indices can be negative; prints \"[0, 1, 2, 3]\"\n", + "nums[2:4] = [8, 9] # Assign a new sublist to a slice\n", + "print(nums) # Prints \"[0, 1, 8, 9, 4]\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will see slicing again in the context of numpy arrays." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Loops**: You can loop over the elements of a list like this:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cat\n", + "dog\n", + "monkey\n" + ] + } + ], + "source": [ + "animals = ['cat', 'dog', 'monkey']\n", + "for animal in animals:\n", + " print(animal)\n", + "# Prints \"cat\", \"dog\", \"monkey\", each on its own line." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want access to the index of each element within the body of a loop, use the built-in `enumerate` function:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#1: cat\n", + "#2: dog\n", + "#3: monkey\n" + ] + } + ], + "source": [ + "animals = ['cat', 'dog', 'monkey']\n", + "for idx, animal in enumerate(animals):\n", + " print(f'#{idx +1}: {animal}')\n", + "# Prints \"#1: cat\", \"#2: dog\", \"#3: monkey\", each on its own line" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**List comprehensions**: When programming, frequently we want to transform one type of data into another. As a simple example, consider the following code that computes square numbers:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0, 1, 4, 9, 16]\n" + ] + } + ], + "source": [ + "nums = [0, 1, 2, 3, 4]\n", + "squares = []\n", + "for x in nums:\n", + " squares.append(x ** 2)\n", + "print(squares) # Prints [0, 1, 4, 9, 16]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can make this code simpler using a **list comprehension**:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0, 1, 4, 9, 16]\n" + ] + } + ], + "source": [ + "nums = [0, 1, 2, 3, 4]\n", + "squares = [x ** 2 for x in nums]\n", + "print(squares) # Prints [0, 1, 4, 9, 16]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "List comprehensions can also contain conditions:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0, 4, 16]\n" + 
] + } + ], + "source": [ + "nums = [0, 1, 2, 3, 4]\n", + "even_squares = [x ** 2 for x in nums if x % 2 == 0]\n", + "print(even_squares) # Prints \"[0, 4, 16]\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Dictionaries\n", + "A dictionary stores (key, value) pairs, similar to a `Map` in Java or an object in Javascript. You can use it like this:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cute\n", + "True\n", + "wet\n", + "N/A\n", + "wet\n", + "N/A\n" + ] + } + ], + "source": [ + "d = {'cat': 'cute', 'dog': 'furry'} # Create a new dictionary with some data\n", + "print(d['cat']) # Get an entry from a dictionary; prints \"cute\"\n", + "print('cat' in d) # Check if a dictionary has a given key; prints \"True\"\n", + "d['fish'] = 'wet' # Set an entry in a dictionary\n", + "print(d['fish']) # Prints \"wet\"\n", + "# print(d['monkey']) # KeyError: 'monkey' not a key of d\n", + "print(d.get('monkey', 'N/A')) # Get an element with a default; prints \"N/A\"\n", + "print(d.get('fish', 'N/A')) # Get an element with a default; prints \"wet\"\n", + "del d['fish'] # Remove an element from a dictionary\n", + "print(d.get('fish', 'N/A')) # \"fish\" is no longer a key; prints \"N/A\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can find all you need to know about dictionaries in the [documentation](https://docs.python.org/3.7/library/stdtypes.html#dict)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Loops**: It is easy to iterate over the keys in a dictionary:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A person has 2 legs\n", + "A cat has 4 legs\n", + "A spider has 8 legs\n" + ] + } + ], + "source": [ + "d = {'person': 2, 'cat': 4, 'spider': 8}\n", + "for animal in d:\n", + " legs = d[animal]\n", + " print(f'A {animal} has {legs} legs')\n", + "# Prints \"A person has 2 legs\", \"A cat has 4 legs\", \"A spider has 8 legs\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want access to keys and their corresponding values, use the items method:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A person has 2 legs\n", + "A cat has 4 legs\n", + "A spider has 8 legs\n" + ] + } + ], + "source": [ + "d = {'person': 2, 'cat': 4, 'spider': 8}\n", + "for animal, legs in d.items():\n", + " print(f'A {animal} has {legs} legs')\n", + "# Prints \"A person has 2 legs\", \"A cat has 4 legs\", \"A spider has 8 legs\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Dictionary comprehensions**: These are similar to list comprehensions, but allow you to easily construct dictionaries. For example:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{0: 0, 2: 4, 4: 16}\n" + ] + } + ], + "source": [ + "nums = [0, 1, 2, 3, 4]\n", + "even_num_to_square = {x: x ** 2 for x in nums if x % 2 == 0}\n", + "print(even_num_to_square) # Prints \"{0: 0, 2: 4, 4: 16}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Sets\n", + "A set is an unordered collection of distinct elements. 
As a simple example, consider the following:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "False\n", + "True\n", + "3\n", + "3\n", + "2\n" + ] + } + ], + "source": [ + "animals = {'cat', 'dog'}\n", + "print('cat' in animals) # Check if an element is in a set; prints \"True\"\n", + "print('fish' in animals) # prints \"False\"\n", + "animals.add('fish') # Add an element to a set\n", + "print('fish' in animals) # Prints \"True\"\n", + "print(len(animals)) # Number of elements in a set; prints \"3\"\n", + "animals.add('cat') # Adding an element that is already in the set does nothing\n", + "print(len(animals)) # Prints \"3\"\n", + "animals.remove('cat') # Remove an element from a set\n", + "print(len(animals)) # Prints \"2\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As usual, everything you want to know about sets can be found in the [documentation](https://docs.python.org/3.7/library/stdtypes.html#set)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Loops**: Iterating over a set has the same syntax as iterating over a list; however since sets are unordered, you cannot make assumptions about the order in which you visit the elements of the set:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#1: fish\n", + "#2: cat\n", + "#3: dog\n" + ] + } + ], + "source": [ + "animals = {'cat', 'dog', 'fish'}\n", + "for idx, animal in enumerate(animals):\n", + " print(f'#{idx +1}: {animal}')\n", + "# Prints items in some undefined order" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Set comprehensions**: Like lists and dictionaries, we can easily construct sets using set comprehensions:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{0, 1, 2, 3, 4, 5}\n" + ] + } + ], + "source": [ + "from math import sqrt\n", + "nums = {int(sqrt(x)) for x in range(30)}\n", + "print(nums) # Prints \"{0, 1, 2, 3, 4, 5}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Tuples\n", + "A tuple is an (immutable) ordered list of values. A tuple is in many ways similar to a list; one of the most important differences is that tuples can be used as keys in dictionaries and as elements of sets, while lists cannot. 
Here is a trivial example:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'tuple'>\n", + "5\n", + "1\n" + ] + } + ], + "source": [ + "d = {(x, x + 1): x for x in range(10)} # Create a dictionary with tuple keys\n", + "t = (5, 6) # Create a tuple\n", + "print(type(t)) # Prints \"<class 'tuple'>\"\n", + "print(d[t]) # Prints \"5\"\n", + "print(d[(1, 2)]) # Prints \"1\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The [documentation](https://docs.python.org/3.7/tutorial/datastructures.html#tuples-and-sequences) has more information about tuples." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Functions\n", + "Python functions are defined using the `def` keyword. For example:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "negative\n", + "zero\n", + "positive\n" + ] + } + ], + "source": [ + "def sign(x):\n", + " if x > 0:\n", + " return 'positive'\n", + " elif x < 0:\n", + " return 'negative'\n", + " else:\n", + " return 'zero'\n", + "\n", + "for x in [-1, 0, 1]:\n", + " print(sign(x))\n", + "# Prints \"negative\", \"zero\", \"positive\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will often define functions to take optional keyword arguments, like this:" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hello, Bob\n", + "HELLO, FRED!\n" + ] + } + ], + "source": [ + "def hello(name, loud=False):\n", + " if loud:\n", + " print(f'HELLO, {name.upper()}!')\n", + " else:\n", + " print(f'Hello, {name}')\n", + "\n", + "hello('Bob') # Prints \"Hello, Bob\"\n", + "hello('Fred', loud=True) # Prints \"HELLO, FRED!\"" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "There is a lot more information about Python functions in the [documentation](https://docs.python.org/3.7/tutorial/controlflow.html#defining-functions)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Classes\n", + "The syntax for defining classes in Python is straightforward:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hello, Fred\n", + "HELLO, FRED!\n" + ] + } + ], + "source": [ + "class Greeter:\n", + "\n", + " # Constructor\n", + " def __init__(self, name):\n", + " self.name = name # Create an instance variable\n", + "\n", + " # Instance method\n", + " def greet(self, loud=False):\n", + " if loud:\n", + " print(f'HELLO, {self.name.upper()}!')\n", + " else:\n", + " print(f'Hello, {self.name}')\n", + "\n", + "g = Greeter('Fred') # Construct an instance of the Greeter class\n", + "g.greet() # Call an instance method; prints \"Hello, Fred\"\n", + "g.greet(loud=True) # Call an instance method; prints \"HELLO, FRED!\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can read a lot more about Python classes in the [documentation](https://docs.python.org/3.7/tutorial/classes.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Numpy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[Numpy](http://www.numpy.org/) is the core library for scientific computing in Python. It provides a high-performance multidimensional array object, and tools for working with these arrays." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lets import `numpy`. 
Most people import it as `np`:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Arrays\n", + "A numpy array is a grid of values, all of the same type, and is indexed by a tuple of nonnegative integers. The number of dimensions is the rank of the array; the shape of an array is a tuple of integers giving the size of the array along each dimension." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can initialize numpy arrays from nested Python lists, and access elements using square brackets:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'numpy.ndarray'>\n", + "(3,)\n", + "1 2 3\n", + "[5 2 3]\n", + "(2, 3)\n", + "1 2 4\n" + ] + } + ], + "source": [ + "a = np.array([1, 2, 3]) # Create a rank 1 array\n", + "print(type(a)) # Prints \"<class 'numpy.ndarray'>\"\n", + "print(a.shape) # Prints \"(3,)\"\n", + "print(a[0], a[1], a[2]) # Prints \"1 2 3\"\n", + "a[0] = 5 # Change an element of the array\n", + "print(a) # Prints \"[5 2 3]\"\n", + "\n", + "b = np.array([[1,2,3],[4,5,6]]) # Create a rank 2 array\n", + "print(b.shape) # Prints \"(2, 3)\"\n", + "print(b[0, 0], b[0, 1], b[1, 0]) # Prints \"1 2 4\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Numpy also provides many functions to create arrays:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0. 0.]\n", + " [0. 0.]]\n", + "[[1. 1.]]\n", + "[[7 7]\n", + " [7 7]]\n", + "[[1. 0.]\n", + " [0. 
1.]]\n", + "[[0.20998725 0.2464906 ]\n", + " [0.16344505 0.70387334]]\n" + ] + } + ], + "source": [ + "a = np.zeros((2,2)) # Create an array of all zeros\n", + "print(a) # Prints \"[[ 0. 0.]\n", + " # [ 0. 0.]]\"\n", + "\n", + "b = np.ones((1,2)) # Create an array of all ones\n", + "print(b) # Prints \"[[ 1. 1.]]\"\n", + "\n", + "c = np.full((2,2), 7) # Create a constant array\n", + "print(c) # Prints \"[[7 7]\n", + " # [7 7]]\"\n", + "\n", + "d = np.eye(2) # Create a 2x2 identity matrix\n", + "print(d) # Prints \"[[ 1. 0.]\n", + " # [ 0. 1.]]\"\n", + "\n", + "e = np.random.random((2,2)) # Create an array filled with random values\n", + "print(e) # Might print \"[[ 0.91940167 0.08143941]\n", + " # [ 0.68744134 0.87236687]]\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can read about other methods of array creation in the [documentation](http://docs.scipy.org/doc/numpy/user/basics.creation.html#arrays-creation)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Array Indexing\n", + "Numpy offers several ways to index into arrays." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Slicing**: Similar to Python lists, numpy arrays can be sliced. 
Since arrays may be multidimensional, you must specify a slice for each dimension of the array:" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2\n", + "77\n" + ] + } + ], + "source": [ + "# Create the following rank 2 array with shape (3, 4)\n", + "# [[ 1 2 3 4]\n", + "# [ 5 6 7 8]\n", + "# [ 9 10 11 12]]\n", + "a = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])\n", + "\n", + "# Use slicing to pull out the subarray consisting of the first 2 rows\n", + "# and columns 1 and 2; b is the following array of shape (2, 2):\n", + "# [[2 3]\n", + "# [6 7]]\n", + "b = a[:2, 1:3]\n", + "\n", + "# A slice of an array is a view into the same data, so modifying it\n", + "# will modify the original array.\n", + "print(a[0, 1]) # Prints \"2\"\n", + "b[0, 0] = 77 # b[0, 0] is the same piece of data as a[0, 1]\n", + "print(a[0, 1]) # Prints \"77\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also mix integer indexing with slice indexing. 
However, doing so will yield an array of lower rank than the original array:" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[5 6 7 8] (4,)\n", + "[[5 6 7 8]] (1, 4)\n", + "[ 2 6 10] (3,)\n", + "[[ 2]\n", + " [ 6]\n", + " [10]] (3, 1)\n" + ] + } + ], + "source": [ + "# Create the following rank 2 array with shape (3, 4)\n", + "# [[ 1 2 3 4]\n", + "# [ 5 6 7 8]\n", + "# [ 9 10 11 12]]\n", + "a = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])\n", + "\n", + "# Two ways of accessing the data in the middle row of the array.\n", + "# Mixing integer indexing with slices yields an array of lower rank,\n", + "# while using only slices yields an array of the same rank as the\n", + "# original array:\n", + "row_r1 = a[1, :] # Rank 1 view of the second row of a\n", + "row_r2 = a[1:2, :] # Rank 2 view of the second row of a\n", + "print(row_r1, row_r1.shape) # Prints \"[5 6 7 8] (4,)\"\n", + "print(row_r2, row_r2.shape) # Prints \"[[5 6 7 8]] (1, 4)\"\n", + "\n", + "# We can make the same distinction when accessing columns of an array:\n", + "col_r1 = a[:, 1]\n", + "col_r2 = a[:, 1:2]\n", + "print(col_r1, col_r1.shape) # Prints \"[ 2 6 10] (3,)\"\n", + "print(col_r2, col_r2.shape) # Prints \"[[ 2]\n", + " # [ 6]\n", + " # [10]] (3, 1)\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Integer array indexing**: When you index into numpy arrays using slicing, the resulting array view will always be a subarray of the original array. In contrast, integer array indexing allows you to construct arbitrary arrays using the data from another array. 
Here is an example:" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1 4 5]\n", + "[1 4 5]\n", + "[2 2]\n", + "[2 2]\n" + ] + } + ], + "source": [ + "a = np.array([[1,2], [3, 4], [5, 6]])\n", + "\n", + "# An example of integer array indexing.\n", + "# The returned array will have shape (3,) and holds a[0, 0], a[1, 1] and a[2, 0].\n", + "print(a[[0, 1, 2], [0, 1, 0]]) # Prints \"[1 4 5]\"\n", + "\n", + "# The above example of integer array indexing is equivalent to this:\n", + "print(np.array([a[0, 0], a[1, 1], a[2, 0]])) # Prints \"[1 4 5]\"\n", + "\n", + "# When using integer array indexing, you can reuse the same\n", + "# element from the source array:\n", + "print(a[[0, 0], [1, 1]]) # Prints \"[2 2]\"\n", + "\n", + "# Equivalent to the previous integer array indexing example\n", + "print(np.array([a[0, 1], a[0, 1]])) # Prints \"[2 2]\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One useful trick with integer array indexing is selecting or mutating one element from each row of a matrix:" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 1 2 3]\n", + " [ 4 5 6]\n", + " [ 7 8 9]\n", + " [10 11 12]]\n", + "[ 1 6 7 11]\n", + "[[11 2 3]\n", + " [ 4 5 16]\n", + " [17 8 9]\n", + " [10 21 12]]\n" + ] + } + ], + "source": [ + "# Create a new array from which we will select elements\n", + "a = np.array([[1,2,3], [4,5,6], [7,8,9], [10, 11, 12]])\n", + "\n", + "print(a) # prints \"array([[ 1, 2, 3],\n", + " # [ 4, 5, 6],\n", + " # [ 7, 8, 9],\n", + " # [10, 11, 12]])\"\n", + "\n", + "# Create an array of indices\n", + "b = np.array([0, 2, 0, 1])\n", + "\n", + "# Select one element from each row of a using the indices in b\n", + "print(a[np.arange(4), b]) # Prints \"[ 1 6 7 11]\"\n", + "\n", + "# Mutate one element from each row of a using the indices in 
b\n", + "a[np.arange(4), b] += 10\n", + "\n", + "print(a) # prints \"array([[11, 2, 3],\n", + " # [ 4, 5, 16],\n", + " # [17, 8, 9],\n", + " # [10, 21, 12]])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Boolean array indexing**: Boolean array indexing lets you pick out arbitrary elements of an array. Frequently this type of indexing is used to select the elements of an array that satisfy some condition. Here is an example:" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[False False]\n", + " [ True True]\n", + " [ True True]]\n", + "[3 4 5 6]\n", + "[3 4 5 6]\n" + ] + } + ], + "source": [ + "a = np.array([[1,2], [3, 4], [5, 6]])\n", + "\n", + "bool_idx = (a > 2) # Find the elements of a that are bigger than 2;\n", + " # this returns a numpy array of Booleans of the same\n", + " # shape as a, where each slot of bool_idx tells\n", + " # whether that element of a is > 2.\n", + "\n", + "print(bool_idx) # Prints \"[[False False]\n", + " # [ True True]\n", + " # [ True True]]\"\n", + "\n", + "# We use boolean array indexing to construct a rank 1 array\n", + "# consisting of the elements of a corresponding to the True values\n", + "# of bool_idx\n", + "print(a[bool_idx]) # Prints \"[3 4 5 6]\"\n", + "\n", + "# We can do all of the above in a single concise statement:\n", + "print(a[a > 2]) # Prints \"[3 4 5 6]\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For brevity we have left out a lot of details about numpy array indexing; if you want to know more you should read the [documentation](http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Datatypes\n", + "Every numpy array is a grid of elements of the same type. Numpy provides a large set of numeric datatypes that you can use to construct arrays. 
Numpy tries to guess a datatype when you create an array, but functions that construct arrays usually also include an optional argument to explicitly specify the datatype. Here is an example:" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "int64\n", + "float64\n", + "int64\n" + ] + } + ], + "source": [ + "x = np.array([1, 2]) # Let numpy choose the datatype\n", + "print(x.dtype) # Prints \"int64\"\n", + "\n", + "x = np.array([1.0, 2.0]) # Let numpy choose the datatype\n", + "print(x.dtype) # Prints \"float64\"\n", + "\n", + "x = np.array([1, 2], dtype=np.int64) # Force a particular datatype\n", + "print(x.dtype) # Prints \"int64\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can read all about numpy datatypes in the [documentation](http://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Array math\n", + "Basic mathematical functions operate elementwise on arrays, and are available both as operator overloads and as functions in the numpy module:" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 6. 8.]\n", + " [10. 12.]]\n", + "[[ 6. 8.]\n", + " [10. 12.]]\n", + "[[-4. -4.]\n", + " [-4. -4.]]\n", + "[[-4. -4.]\n", + " [-4. -4.]]\n", + "[[ 5. 12.]\n", + " [21. 32.]]\n", + "[[ 5. 12.]\n", + " [21. 32.]]\n", + "[[0.2 0.33333333]\n", + " [0.42857143 0.5 ]]\n", + "[[0.2 0.33333333]\n", + " [0.42857143 0.5 ]]\n", + "[[1. 1.41421356]\n", + " [1.73205081 2. 
]]\n" + ] + } + ], + "source": [ + "x = np.array([[1,2],[3,4]], dtype=np.float64)\n", + "y = np.array([[5,6],[7,8]], dtype=np.float64)\n", + "\n", + "# Elementwise sum; both produce the array\n", + "# [[ 6.0 8.0]\n", + "# [10.0 12.0]]\n", + "print(x + y)\n", + "print(np.add(x, y))\n", + "\n", + "# Elementwise difference; both produce the array\n", + "# [[-4.0 -4.0]\n", + "# [-4.0 -4.0]]\n", + "print(x - y)\n", + "print(np.subtract(x, y))\n", + "\n", + "# Elementwise product; both produce the array\n", + "# [[ 5.0 12.0]\n", + "# [21.0 32.0]]\n", + "print(x * y)\n", + "print(np.multiply(x, y))\n", + "\n", + "# Elementwise division; both produce the array\n", + "# [[ 0.2 0.33333333]\n", + "# [ 0.42857143 0.5 ]]\n", + "print(x / y)\n", + "print(np.divide(x, y))\n", + "\n", + "# Elementwise square root; produces the array\n", + "# [[ 1. 1.41421356]\n", + "# [ 1.73205081 2. ]]\n", + "print(np.sqrt(x))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that `*` is elementwise multiplication, not matrix multiplication. We instead use the `dot` function to compute inner products of vectors, to multiply a vector by a matrix, and to multiply matrices. 
`dot` is available both as a function in the numpy module and as an instance method of array objects:" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "219\n", + "219\n", + "[29 67]\n", + "[29 67]\n", + "[[19 22]\n", + " [43 50]]\n", + "[[19 22]\n", + " [43 50]]\n" + ] + } + ], + "source": [ + "x = np.array([[1,2],[3,4]])\n", + "y = np.array([[5,6],[7,8]])\n", + "\n", + "v = np.array([9,10])\n", + "w = np.array([11, 12])\n", + "\n", + "# Inner product of vectors; both produce 219\n", + "print(v.dot(w))\n", + "print(np.dot(v, w))\n", + "\n", + "# Matrix / vector product; both produce the rank 1 array [29 67]\n", + "print(x.dot(v))\n", + "print(np.dot(x, v))\n", + "\n", + "# Matrix / matrix product; both produce the rank 2 array\n", + "# [[19 22]\n", + "# [43 50]]\n", + "print(x.dot(y))\n", + "print(np.dot(x, y))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Numpy provides many useful functions for performing computations on arrays; one of the most useful is `sum`:" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10\n", + "[4 6]\n", + "[3 7]\n" + ] + } + ], + "source": [ + "x = np.array([[1,2],[3,4]])\n", + "\n", + "print(np.sum(x)) # Compute sum of all elements; prints \"10\"\n", + "print(np.sum(x, axis=0)) # Compute sum of each column; prints \"[4 6]\"\n", + "print(np.sum(x, axis=1)) # Compute sum of each row; prints \"[3 7]\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can find the full list of mathematical functions provided by numpy in the [documentation](http://docs.scipy.org/doc/numpy/reference/routines.math.html)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Apart from computing mathematical functions using arrays, we frequently need to reshape or otherwise manipulate data in arrays. The simplest example of this type of operation is transposing a matrix; to transpose a matrix, simply use the `T` attribute of an array object:" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[1 2]\n", + " [3 4]]\n", + "[[1 3]\n", + " [2 4]]\n", + "[1 2 3]\n", + "[1 2 3]\n" + ] + } + ], + "source": [ + "x = np.array([[1,2], [3,4]])\n", + "print(x) # Prints \"[[1 2]\n", + " # [3 4]]\"\n", + "print(x.T) # Prints \"[[1 3]\n", + " # [2 4]]\"\n", + "\n", + "# Note that taking the transpose of a rank 1 array does nothing:\n", + "v = np.array([1,2,3])\n", + "print(v) # Prints \"[1 2 3]\"\n", + "print(v.T) # Prints \"[1 2 3]\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Numpy provides many more functions for manipulating arrays; you can see the full list in the [documentation](http://docs.scipy.org/doc/numpy/reference/routines.array-manipulation.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Numpy Documentation\n", + "This brief overview has touched on many of the important things that you need to know about numpy, but is far from complete. Check out the [numpy reference](http://docs.scipy.org/doc/numpy/reference/) to find out much more about numpy." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Pandas\n", + "The pandas library provides high-performance, easy-to-use data structures and data analysis tools. The main data structure is the DataFrame, which you can think of as an in-memory 2D table (like a spreadsheet, with column names and row labels). 
Many features available in Excel are available programmatically, such as creating pivot tables, computing columns based on other columns, plotting graphs, etc. You can also group rows by column value, or join tables much like in SQL. Pandas is also great at handling time series.\n", + "\n", + "We will have a more detailed look at pandas in the second tutorial.\n", + "\n", + "## The Iris dataset\n", + "\n", + " For now, we will use pandas to inspect our first dataset, the famous [Iris dataset](https://archive.ics.uci.edu/ml/datasets/Iris).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sepal-length</th>\n", + " <th>sepal-width</th>\n", + " <th>petal-length</th>\n", + " <th>petal-width</th>\n", + " <th>class-label</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>4.7</td>\n", + " <td>3.2</td>\n", + " <td>1.3</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>109</th>\n", + " <td>7.2</td>\n", + " <td>3.6</td>\n", + " <td>6.1</td>\n", + " <td>2.5</td>\n", + " <td>Iris-virginica</td>\n", + " </tr>\n", + " <tr>\n", + " <th>71</th>\n", + " <td>6.1</td>\n", + " <td>2.8</td>\n", + " <td>4.0</td>\n", + " <td>1.3</td>\n", + " <td>Iris-versicolor</td>\n", + " </tr>\n", + " <tr>\n", + " <th>144</th>\n", + " <td>6.7</td>\n", + " <td>3.3</td>\n", + " <td>5.7</td>\n", + " <td>2.5</td>\n", + " <td>Iris-virginica</td>\n", + " </tr>\n", + " <tr>\n", + " <th>128</th>\n", + " <td>6.4</td>\n", + " 
<td>2.8</td>\n", + " <td>5.6</td>\n", + " <td>2.1</td>\n", + " <td>Iris-virginica</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>5.0</td>\n", + " <td>3.4</td>\n", + " <td>1.6</td>\n", + " <td>0.4</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75</th>\n", + " <td>6.6</td>\n", + " <td>3.0</td>\n", + " <td>4.4</td>\n", + " <td>1.4</td>\n", + " <td>Iris-versicolor</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>4.8</td>\n", + " <td>3.0</td>\n", + " <td>1.4</td>\n", + " <td>0.1</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>86</th>\n", + " <td>6.7</td>\n", + " <td>3.1</td>\n", + " <td>4.7</td>\n", + " <td>1.5</td>\n", + " <td>Iris-versicolor</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>4.8</td>\n", + " <td>3.4</td>\n", + " <td>1.9</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sepal-length sepal-width petal-length petal-width class-label\n", + "2 4.7 3.2 1.3 0.2 Iris-setosa\n", + "109 7.2 3.6 6.1 2.5 Iris-virginica\n", + "71 6.1 2.8 4.0 1.3 Iris-versicolor\n", + "144 6.7 3.3 5.7 2.5 Iris-virginica\n", + "128 6.4 2.8 5.6 2.1 Iris-virginica\n", + "26 5.0 3.4 1.6 0.4 Iris-setosa\n", + "75 6.6 3.0 4.4 1.4 Iris-versicolor\n", + "12 4.8 3.0 1.4 0.1 Iris-setosa\n", + "86 6.7 3.1 4.7 1.5 Iris-versicolor\n", + "24 4.8 3.4 1.9 0.2 Iris-setosa" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data\"\n", + "names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class-label']\n", + "dataset = pd.read_csv(url, names=names)\n", + "dataset.sample(10) # prints a ten random samples" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`pandas` makes it easy to get an overview of the dataset 
and its statistics:" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 150 entries, 0 to 149\n", + "Data columns (total 5 columns):\n", + "sepal-length 150 non-null float64\n", + "sepal-width 150 non-null float64\n", + "petal-length 150 non-null float64\n", + "petal-width 150 non-null float64\n", + "class-label 150 non-null object\n", + "dtypes: float64(4), object(1)\n", + "memory usage: 5.9+ KB\n" + ] + } + ], + "source": [ + "dataset.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sepal-length</th>\n", + " <th>sepal-width</th>\n", + " <th>petal-length</th>\n", + " <th>petal-width</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>count</th>\n", + " <td>150.000000</td>\n", + " <td>150.000000</td>\n", + " <td>150.000000</td>\n", + " <td>150.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>mean</th>\n", + " <td>5.843333</td>\n", + " <td>3.054000</td>\n", + " <td>3.758667</td>\n", + " <td>1.198667</td>\n", + " </tr>\n", + " <tr>\n", + " <th>std</th>\n", + " <td>0.828066</td>\n", + " <td>0.433594</td>\n", + " <td>1.764420</td>\n", + " <td>0.763161</td>\n", + " </tr>\n", + " <tr>\n", + " <th>min</th>\n", + " <td>4.300000</td>\n", + " <td>2.000000</td>\n", + " <td>1.000000</td>\n", + " <td>0.100000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25%</th>\n", + " 
<td>5.100000</td>\n", + " <td>2.800000</td>\n", + " <td>1.600000</td>\n", + " <td>0.300000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>50%</th>\n", + " <td>5.800000</td>\n", + " <td>3.000000</td>\n", + " <td>4.350000</td>\n", + " <td>1.300000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75%</th>\n", + " <td>6.400000</td>\n", + " <td>3.300000</td>\n", + " <td>5.100000</td>\n", + " <td>1.800000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>max</th>\n", + " <td>7.900000</td>\n", + " <td>4.400000</td>\n", + " <td>6.900000</td>\n", + " <td>2.500000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sepal-length sepal-width petal-length petal-width\n", + "count 150.000000 150.000000 150.000000 150.000000\n", + "mean 5.843333 3.054000 3.758667 1.198667\n", + "std 0.828066 0.433594 1.764420 0.763161\n", + "min 4.300000 2.000000 1.000000 0.100000\n", + "25% 5.100000 2.800000 1.600000 0.300000\n", + "50% 5.800000 3.000000 4.350000 1.300000\n", + "75% 6.400000 3.300000 5.100000 1.800000\n", + "max 7.900000 4.400000 6.900000 2.500000" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also filter the dataset based on features or instances quite easily:" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 5.1\n", + "1 4.9\n", + "2 4.7\n", + "3 4.6\n", + "4 5.0\n", + "5 5.4\n", + "6 4.6\n", + "7 5.0\n", + "8 4.4\n", + "9 4.9\n", + "10 5.4\n", + "11 4.8\n", + "12 4.8\n", + "13 4.3\n", + "14 5.8\n", + "15 5.7\n", + "16 5.4\n", + "17 5.1\n", + "18 5.7\n", + "19 5.1\n", + "20 5.4\n", + "21 5.1\n", + "22 4.6\n", + "23 5.1\n", + "24 4.8\n", + "25 5.0\n", + "26 5.0\n", + "27 5.2\n", + "28 5.2\n", + "29 4.7\n", + " ... 
\n", + "120 6.9\n", + "121 5.6\n", + "122 7.7\n", + "123 6.3\n", + "124 6.7\n", + "125 7.2\n", + "126 6.2\n", + "127 6.1\n", + "128 6.4\n", + "129 7.2\n", + "130 7.4\n", + "131 7.9\n", + "132 6.4\n", + "133 6.3\n", + "134 6.1\n", + "135 7.7\n", + "136 6.3\n", + "137 6.4\n", + "138 6.0\n", + "139 6.9\n", + "140 6.7\n", + "141 6.9\n", + "142 5.8\n", + "143 6.8\n", + "144 6.7\n", + "145 6.7\n", + "146 6.3\n", + "147 6.5\n", + "148 6.2\n", + "149 5.9\n", + "Name: sepal-length, Length: 150, dtype: float64" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset['sepal-length'] # selects the 'sepal-length' feature column" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 Iris-setosa\n", + "1 Iris-setosa\n", + "2 Iris-setosa\n", + "3 Iris-setosa\n", + "4 Iris-setosa\n", + "5 Iris-setosa\n", + "6 Iris-setosa\n", + "7 Iris-setosa\n", + "8 Iris-setosa\n", + "9 Iris-setosa\n", + "10 Iris-setosa\n", + "11 Iris-setosa\n", + "12 Iris-setosa\n", + "13 Iris-setosa\n", + "14 Iris-setosa\n", + "15 Iris-setosa\n", + "16 Iris-setosa\n", + "17 Iris-setosa\n", + "18 Iris-setosa\n", + "19 Iris-setosa\n", + "20 Iris-setosa\n", + "21 Iris-setosa\n", + "22 Iris-setosa\n", + "23 Iris-setosa\n", + "24 Iris-setosa\n", + "25 Iris-setosa\n", + "26 Iris-setosa\n", + "27 Iris-setosa\n", + "28 Iris-setosa\n", + "29 Iris-setosa\n", + " ... 
\n", + "120 Iris-virginica\n", + "121 Iris-virginica\n", + "122 Iris-virginica\n", + "123 Iris-virginica\n", + "124 Iris-virginica\n", + "125 Iris-virginica\n", + "126 Iris-virginica\n", + "127 Iris-virginica\n", + "128 Iris-virginica\n", + "129 Iris-virginica\n", + "130 Iris-virginica\n", + "131 Iris-virginica\n", + "132 Iris-virginica\n", + "133 Iris-virginica\n", + "134 Iris-virginica\n", + "135 Iris-virginica\n", + "136 Iris-virginica\n", + "137 Iris-virginica\n", + "138 Iris-virginica\n", + "139 Iris-virginica\n", + "140 Iris-virginica\n", + "141 Iris-virginica\n", + "142 Iris-virginica\n", + "143 Iris-virginica\n", + "144 Iris-virginica\n", + "145 Iris-virginica\n", + "146 Iris-virginica\n", + "147 Iris-virginica\n", + "148 Iris-virginica\n", + "149 Iris-virginica\n", + "Name: class-label, Length: 150, dtype: object" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset['class-label'] # selects the class-labels" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>sepal-length</th>\n", + " <th>sepal-width</th>\n", + " <th>petal-length</th>\n", + " <th>petal-width</th>\n", + " <th>class-label</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>5.1</td>\n", + " <td>3.5</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>4.9</td>\n", + " <td>3.0</td>\n", + " <td>1.4</td>\n", + " 
<td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>4.7</td>\n", + " <td>3.2</td>\n", + " <td>1.3</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>4.6</td>\n", + " <td>3.1</td>\n", + " <td>1.5</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>5.0</td>\n", + " <td>3.6</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>5.4</td>\n", + " <td>3.9</td>\n", + " <td>1.7</td>\n", + " <td>0.4</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>4.6</td>\n", + " <td>3.4</td>\n", + " <td>1.4</td>\n", + " <td>0.3</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>5.0</td>\n", + " <td>3.4</td>\n", + " <td>1.5</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>4.4</td>\n", + " <td>2.9</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>4.9</td>\n", + " <td>3.1</td>\n", + " <td>1.5</td>\n", + " <td>0.1</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>5.4</td>\n", + " <td>3.7</td>\n", + " <td>1.5</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>4.8</td>\n", + " <td>3.4</td>\n", + " <td>1.6</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>4.8</td>\n", + " <td>3.0</td>\n", + " <td>1.4</td>\n", + " <td>0.1</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>4.3</td>\n", + " <td>3.0</td>\n", + " <td>1.1</td>\n", + " <td>0.1</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " 
<th>14</th>\n", + " <td>5.8</td>\n", + " <td>4.0</td>\n", + " <td>1.2</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>5.7</td>\n", + " <td>4.4</td>\n", + " <td>1.5</td>\n", + " <td>0.4</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>5.4</td>\n", + " <td>3.9</td>\n", + " <td>1.3</td>\n", + " <td>0.4</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>5.1</td>\n", + " <td>3.5</td>\n", + " <td>1.4</td>\n", + " <td>0.3</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>5.7</td>\n", + " <td>3.8</td>\n", + " <td>1.7</td>\n", + " <td>0.3</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>5.1</td>\n", + " <td>3.8</td>\n", + " <td>1.5</td>\n", + " <td>0.3</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>5.4</td>\n", + " <td>3.4</td>\n", + " <td>1.7</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>21</th>\n", + " <td>5.1</td>\n", + " <td>3.7</td>\n", + " <td>1.5</td>\n", + " <td>0.4</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>22</th>\n", + " <td>4.6</td>\n", + " <td>3.6</td>\n", + " <td>1.0</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>23</th>\n", + " <td>5.1</td>\n", + " <td>3.3</td>\n", + " <td>1.7</td>\n", + " <td>0.5</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>24</th>\n", + " <td>4.8</td>\n", + " <td>3.4</td>\n", + " <td>1.9</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25</th>\n", + " <td>5.0</td>\n", + " <td>3.0</td>\n", + " <td>1.6</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>5.0</td>\n", + " <td>3.4</td>\n", + " <td>1.6</td>\n", 
+ " <td>0.4</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>5.2</td>\n", + " <td>3.5</td>\n", + " <td>1.5</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>28</th>\n", + " <td>5.2</td>\n", + " <td>3.4</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>29</th>\n", + " <td>4.7</td>\n", + " <td>3.2</td>\n", + " <td>1.6</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>30</th>\n", + " <td>4.8</td>\n", + " <td>3.1</td>\n", + " <td>1.6</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>31</th>\n", + " <td>5.4</td>\n", + " <td>3.4</td>\n", + " <td>1.5</td>\n", + " <td>0.4</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>32</th>\n", + " <td>5.2</td>\n", + " <td>4.1</td>\n", + " <td>1.5</td>\n", + " <td>0.1</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>33</th>\n", + " <td>5.5</td>\n", + " <td>4.2</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>34</th>\n", + " <td>4.9</td>\n", + " <td>3.1</td>\n", + " <td>1.5</td>\n", + " <td>0.1</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>35</th>\n", + " <td>5.0</td>\n", + " <td>3.2</td>\n", + " <td>1.2</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>36</th>\n", + " <td>5.5</td>\n", + " <td>3.5</td>\n", + " <td>1.3</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>37</th>\n", + " <td>4.9</td>\n", + " <td>3.1</td>\n", + " <td>1.5</td>\n", + " <td>0.1</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>38</th>\n", + " <td>4.4</td>\n", + " <td>3.0</td>\n", + " <td>1.3</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + 
" <th>39</th>\n", + " <td>5.1</td>\n", + " <td>3.4</td>\n", + " <td>1.5</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>40</th>\n", + " <td>5.0</td>\n", + " <td>3.5</td>\n", + " <td>1.3</td>\n", + " <td>0.3</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>41</th>\n", + " <td>4.5</td>\n", + " <td>2.3</td>\n", + " <td>1.3</td>\n", + " <td>0.3</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>42</th>\n", + " <td>4.4</td>\n", + " <td>3.2</td>\n", + " <td>1.3</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>43</th>\n", + " <td>5.0</td>\n", + " <td>3.5</td>\n", + " <td>1.6</td>\n", + " <td>0.6</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>44</th>\n", + " <td>5.1</td>\n", + " <td>3.8</td>\n", + " <td>1.9</td>\n", + " <td>0.4</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>45</th>\n", + " <td>4.8</td>\n", + " <td>3.0</td>\n", + " <td>1.4</td>\n", + " <td>0.3</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>46</th>\n", + " <td>5.1</td>\n", + " <td>3.8</td>\n", + " <td>1.6</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>47</th>\n", + " <td>4.6</td>\n", + " <td>3.2</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>48</th>\n", + " <td>5.3</td>\n", + " <td>3.7</td>\n", + " <td>1.5</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " <tr>\n", + " <th>49</th>\n", + " <td>5.0</td>\n", + " <td>3.3</td>\n", + " <td>1.4</td>\n", + " <td>0.2</td>\n", + " <td>Iris-setosa</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " sepal-length sepal-width petal-length petal-width class-label\n", + "0 5.1 3.5 1.4 0.2 Iris-setosa\n", + "1 4.9 3.0 1.4 0.2 Iris-setosa\n", + "2 4.7 3.2 1.3 0.2 Iris-setosa\n", + "3 
4.6 3.1 1.5 0.2 Iris-setosa\n", + "4 5.0 3.6 1.4 0.2 Iris-setosa\n", + "5 5.4 3.9 1.7 0.4 Iris-setosa\n", + "6 4.6 3.4 1.4 0.3 Iris-setosa\n", + "7 5.0 3.4 1.5 0.2 Iris-setosa\n", + "8 4.4 2.9 1.4 0.2 Iris-setosa\n", + "9 4.9 3.1 1.5 0.1 Iris-setosa\n", + "10 5.4 3.7 1.5 0.2 Iris-setosa\n", + "11 4.8 3.4 1.6 0.2 Iris-setosa\n", + "12 4.8 3.0 1.4 0.1 Iris-setosa\n", + "13 4.3 3.0 1.1 0.1 Iris-setosa\n", + "14 5.8 4.0 1.2 0.2 Iris-setosa\n", + "15 5.7 4.4 1.5 0.4 Iris-setosa\n", + "16 5.4 3.9 1.3 0.4 Iris-setosa\n", + "17 5.1 3.5 1.4 0.3 Iris-setosa\n", + "18 5.7 3.8 1.7 0.3 Iris-setosa\n", + "19 5.1 3.8 1.5 0.3 Iris-setosa\n", + "20 5.4 3.4 1.7 0.2 Iris-setosa\n", + "21 5.1 3.7 1.5 0.4 Iris-setosa\n", + "22 4.6 3.6 1.0 0.2 Iris-setosa\n", + "23 5.1 3.3 1.7 0.5 Iris-setosa\n", + "24 4.8 3.4 1.9 0.2 Iris-setosa\n", + "25 5.0 3.0 1.6 0.2 Iris-setosa\n", + "26 5.0 3.4 1.6 0.4 Iris-setosa\n", + "27 5.2 3.5 1.5 0.2 Iris-setosa\n", + "28 5.2 3.4 1.4 0.2 Iris-setosa\n", + "29 4.7 3.2 1.6 0.2 Iris-setosa\n", + "30 4.8 3.1 1.6 0.2 Iris-setosa\n", + "31 5.4 3.4 1.5 0.4 Iris-setosa\n", + "32 5.2 4.1 1.5 0.1 Iris-setosa\n", + "33 5.5 4.2 1.4 0.2 Iris-setosa\n", + "34 4.9 3.1 1.5 0.1 Iris-setosa\n", + "35 5.0 3.2 1.2 0.2 Iris-setosa\n", + "36 5.5 3.5 1.3 0.2 Iris-setosa\n", + "37 4.9 3.1 1.5 0.1 Iris-setosa\n", + "38 4.4 3.0 1.3 0.2 Iris-setosa\n", + "39 5.1 3.4 1.5 0.2 Iris-setosa\n", + "40 5.0 3.5 1.3 0.3 Iris-setosa\n", + "41 4.5 2.3 1.3 0.3 Iris-setosa\n", + "42 4.4 3.2 1.3 0.2 Iris-setosa\n", + "43 5.0 3.5 1.6 0.6 Iris-setosa\n", + "44 5.1 3.8 1.9 0.4 Iris-setosa\n", + "45 4.8 3.0 1.4 0.3 Iris-setosa\n", + "46 5.1 3.8 1.6 0.2 Iris-setosa\n", + "47 4.6 3.2 1.4 0.2 Iris-setosa\n", + "48 5.3 3.7 1.5 0.2 Iris-setosa\n", + "49 5.0 3.3 1.4 0.2 Iris-setosa" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset[dataset['class-label'] == 'Iris-setosa'] # filters the dataset based on a specific class label" + ] + }, + { 
+ "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next week\n", + "We will cover more data loading, selection, manipulation, exploration and visualization next week." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.2" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} -- GitLab