From 02368917cd906cb57eac3c3f0df2055c68da0b3b Mon Sep 17 00:00:00 2001 From: aakan96 <aakan96@mi.fu-berlin.de> Date: Tue, 11 Jul 2023 18:55:00 +0000 Subject: [PATCH] Neue Datei hochladen --- .../DS_mRNA_limma_dataset_svm_F.ipynb | 1866 +++++++++++++++++ 1 file changed, 1866 insertions(+) create mode 100644 Machine Learning/DS_mRNA_limma_dataset_svm_F.ipynb diff --git a/Machine Learning/DS_mRNA_limma_dataset_svm_F.ipynb b/Machine Learning/DS_mRNA_limma_dataset_svm_F.ipynb new file mode 100644 index 0000000..3428ccc --- /dev/null +++ b/Machine Learning/DS_mRNA_limma_dataset_svm_F.ipynb @@ -0,0 +1,1866 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 113, + "id": "f097ad55", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "#from sklearn.model_selection import cross_val_score\n", + "#from sklearn.metrics import accuracy_score\n", + "#import sklearn.metrics as metrics\n", + "#from sklearn.metrics import auc\n", + "from sklearn.metrics import RocCurveDisplay\n", + "#from sklearn.model_selection import KFold\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from imblearn.over_sampling import SMOTE\n", + "from sklearn.linear_model import Lasso\n", + "import xgboost as xgb\n", + "from sklearn.model_selection import GridSearchCV\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "np.random.seed(7)" + ] + }, + { + "cell_type": "markdown", + "id": "73b6611a", + "metadata": {}, + "source": [ + "# Data Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "id": "0eeb7a35", + "metadata": {}, + "outputs": [], + "source": [ + "df_train = pd.read_csv(\"DS/mRNA_DS_preprocessed_training_data.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "id": "a6ab23aa", + "metadata": {}, + "outputs": [], + "source": [ + "df_test = 
pd.read_csv(\"DS/mRNA_DS_test_data.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "id": "683b63ce", + "metadata": {}, + "outputs": [], + "source": [ + "df_train = df_train.T\n", + "df_test = df_test.T" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "id": "7928107a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>0</th>\n", + " <th>1</th>\n", + " <th>2</th>\n", + " <th>3</th>\n", + " <th>4</th>\n", + " <th>5</th>\n", + " <th>6</th>\n", + " <th>7</th>\n", + " <th>8</th>\n", + " <th>9</th>\n", + " <th>...</th>\n", + " <th>4847</th>\n", + " <th>4848</th>\n", + " <th>4849</th>\n", + " <th>4850</th>\n", + " <th>4851</th>\n", + " <th>4852</th>\n", + " <th>4853</th>\n", + " <th>4854</th>\n", + " <th>4855</th>\n", + " <th>4856</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>GSM935144</th>\n", + " <td>8.369057</td>\n", + " <td>6.512733</td>\n", + " <td>7.566895</td>\n", + " <td>8.583529</td>\n", + " <td>3.89921</td>\n", + " <td>7.500077</td>\n", + " <td>5.286338</td>\n", + " <td>8.024788</td>\n", + " <td>10.163873</td>\n", + " <td>10.411274</td>\n", + " <td>...</td>\n", + " <td>8.397834</td>\n", + " <td>6.495765</td>\n", + " <td>7.682125</td>\n", + " <td>5.636567</td>\n", + " <td>4.662533</td>\n", + " <td>4.581447</td>\n", + " <td>4.913772</td>\n", + " <td>7.314994</td>\n", + " <td>4.946271</td>\n", + " <td>5.824582</td>\n", + " </tr>\n", + " <tr>\n", + " <th>GSM935145</th>\n", + " <td>10.480503</td>\n", + " <td>7.291269</td>\n", + " <td>7.39136</td>\n", + " 
<td>6.463924</td>\n", + " <td>5.363036</td>\n", + " <td>9.041782</td>\n", + " <td>6.814046</td>\n", + " <td>7.995978</td>\n", + " <td>10.645666</td>\n", + " <td>10.322436</td>\n", + " <td>...</td>\n", + " <td>8.307262</td>\n", + " <td>5.998426</td>\n", + " <td>7.959026</td>\n", + " <td>6.265455</td>\n", + " <td>6.564568</td>\n", + " <td>4.97292</td>\n", + " <td>5.502765</td>\n", + " <td>7.732989</td>\n", + " <td>7.491779</td>\n", + " <td>6.065943</td>\n", + " </tr>\n", + " <tr>\n", + " <th>GSM935146</th>\n", + " <td>8.142284</td>\n", + " <td>6.62082</td>\n", + " <td>7.180874</td>\n", + " <td>7.354349</td>\n", + " <td>3.835406</td>\n", + " <td>7.817166</td>\n", + " <td>5.582456</td>\n", + " <td>7.851524</td>\n", + " <td>10.104783</td>\n", + " <td>10.148618</td>\n", + " <td>...</td>\n", + " <td>8.169248</td>\n", + " <td>6.025941</td>\n", + " <td>7.20579</td>\n", + " <td>5.964245</td>\n", + " <td>4.989752</td>\n", + " <td>3.925917</td>\n", + " <td>4.362655</td>\n", + " <td>7.430351</td>\n", + " <td>7.569171</td>\n", + " <td>5.600952</td>\n", + " </tr>\n", + " <tr>\n", + " <th>GSM935147</th>\n", + " <td>11.111467</td>\n", + " <td>7.836151</td>\n", + " <td>6.825569</td>\n", + " <td>6.778957</td>\n", + " <td>5.571634</td>\n", + " <td>9.353702</td>\n", + " <td>6.078075</td>\n", + " <td>7.895813</td>\n", + " <td>11.05969</td>\n", + " <td>10.465087</td>\n", + " <td>...</td>\n", + " <td>8.391617</td>\n", + " <td>6.466206</td>\n", + " <td>8.108325</td>\n", + " <td>6.823888</td>\n", + " <td>6.039951</td>\n", + " <td>4.626001</td>\n", + " <td>6.16125</td>\n", + " <td>7.9986</td>\n", + " <td>8.799876</td>\n", + " <td>6.02798</td>\n", + " </tr>\n", + " <tr>\n", + " <th>GSM935148</th>\n", + " <td>8.34986</td>\n", + " <td>6.783343</td>\n", + " <td>7.110082</td>\n", + " <td>7.492759</td>\n", + " <td>3.77593</td>\n", + " <td>7.289664</td>\n", + " <td>5.335561</td>\n", + " <td>7.943047</td>\n", + " <td>10.453795</td>\n", + " <td>10.084535</td>\n", + " <td>...</td>\n", + " 
<td>8.699464</td>\n", + " <td>6.098069</td>\n", + " <td>7.788782</td>\n", + " <td>5.897471</td>\n", + " <td>4.804696</td>\n", + " <td>4.15752</td>\n", + " <td>4.363255</td>\n", + " <td>7.179146</td>\n", + " <td>6.736683</td>\n", + " <td>5.498095</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>GSM935200</th>\n", + " <td>10.006763</td>\n", + " <td>6.303831</td>\n", + " <td>8.050464</td>\n", + " <td>7.210553</td>\n", + " <td>4.417198</td>\n", + " <td>7.079421</td>\n", + " <td>6.583233</td>\n", + " <td>8.350974</td>\n", + " <td>10.814444</td>\n", + " <td>9.84037</td>\n", + " <td>...</td>\n", + " <td>7.525502</td>\n", + " <td>5.588707</td>\n", + " <td>7.105646</td>\n", + " <td>7.142224</td>\n", + " <td>5.614111</td>\n", + " <td>3.709397</td>\n", + " <td>3.954994</td>\n", + " <td>6.490069</td>\n", + " <td>4.385806</td>\n", + " <td>5.972305</td>\n", + " </tr>\n", + " <tr>\n", + " <th>GSM935201</th>\n", + " <td>10.409206</td>\n", + " <td>6.753581</td>\n", + " <td>8.037541</td>\n", + " <td>7.197887</td>\n", + " <td>4.055054</td>\n", + " <td>7.446122</td>\n", + " <td>7.284751</td>\n", + " <td>8.205061</td>\n", + " <td>9.81193</td>\n", + " <td>10.075071</td>\n", + " <td>...</td>\n", + " <td>8.209273</td>\n", + " <td>5.922377</td>\n", + " <td>7.892188</td>\n", + " <td>6.476743</td>\n", + " <td>5.870292</td>\n", + " <td>4.874964</td>\n", + " <td>4.536575</td>\n", + " <td>6.740481</td>\n", + " <td>4.728352</td>\n", + " <td>6.589286</td>\n", + " </tr>\n", + " <tr>\n", + " <th>GSM935202</th>\n", + " 
<td>10.959619</td>\n", + " <td>6.719885</td>\n", + " <td>8.171633</td>\n", + " <td>7.612934</td>\n", + " <td>4.459969</td>\n", + " <td>7.209145</td>\n", + " <td>7.08157</td>\n", + " <td>8.402631</td>\n", + " <td>11.421799</td>\n", + " <td>10.136597</td>\n", + " <td>...</td>\n", + " <td>6.972532</td>\n", + " <td>6.10597</td>\n", + " <td>7.300605</td>\n", + " <td>7.098034</td>\n", + " <td>5.027756</td>\n", + " <td>3.903551</td>\n", + " <td>4.801002</td>\n", + " <td>7.021587</td>\n", + " <td>5.007519</td>\n", + " <td>6.399793</td>\n", + " </tr>\n", + " <tr>\n", + " <th>GSM935203</th>\n", + " <td>10.68054</td>\n", + " <td>7.447833</td>\n", + " <td>8.118057</td>\n", + " <td>6.53307</td>\n", + " <td>3.952315</td>\n", + " <td>7.598421</td>\n", + " <td>6.257321</td>\n", + " <td>8.258537</td>\n", + " <td>10.437635</td>\n", + " <td>10.576629</td>\n", + " <td>...</td>\n", + " <td>8.648412</td>\n", + " <td>6.04367</td>\n", + " <td>7.982976</td>\n", + " <td>5.885975</td>\n", + " <td>6.998636</td>\n", + " <td>4.572075</td>\n", + " <td>5.612742</td>\n", + " <td>7.441464</td>\n", + " <td>4.96548</td>\n", + " <td>6.258775</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Gene symbol</th>\n", + " <td>MIR4640///DDR1</td>\n", + " <td>RFC2</td>\n", + " <td>PAX8</td>\n", + " <td>MIR5193///UBA7</td>\n", + " <td>CYP2E1</td>\n", + " <td>MMP14</td>\n", + " <td>PLD1</td>\n", + " <td>DTX2P1-UPK3BP1-PMS2P11</td>\n", + " <td>CAPNS1</td>\n", + " <td>HNRNPC</td>\n", + " <td>...</td>\n", + " <td>C11orf24</td>\n", + " <td>B4GALT7</td>\n", + " <td>DVL2</td>\n", + " <td>RBKS</td>\n", + " <td>SENP5</td>\n", + " <td>POLR2J4</td>\n", + " <td>INO80B-WBP1///INO80B</td>\n", + " <td>SNHG17</td>\n", + " <td>MEX3D</td>\n", + " <td>DCAF15</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>61 rows × 4857 columns</p>\n", + "</div>" + ], + "text/plain": [ + " 0 1 2 3 4 \\\n", + "GSM935144 8.369057 6.512733 7.566895 8.583529 3.89921 \n", + "GSM935145 10.480503 7.291269 7.39136 6.463924 5.363036 \n", + 
"GSM935146 8.142284 6.62082 7.180874 7.354349 3.835406 \n", + "GSM935147 11.111467 7.836151 6.825569 6.778957 5.571634 \n", + "GSM935148 8.34986 6.783343 7.110082 7.492759 3.77593 \n", + "... ... ... ... ... ... \n", + "GSM935200 10.006763 6.303831 8.050464 7.210553 4.417198 \n", + "GSM935201 10.409206 6.753581 8.037541 7.197887 4.055054 \n", + "GSM935202 10.959619 6.719885 8.171633 7.612934 4.459969 \n", + "GSM935203 10.68054 7.447833 8.118057 6.53307 3.952315 \n", + "Gene symbol MIR4640///DDR1 RFC2 PAX8 MIR5193///UBA7 CYP2E1 \n", + "\n", + " 5 6 7 8 9 \\\n", + "GSM935144 7.500077 5.286338 8.024788 10.163873 10.411274 \n", + "GSM935145 9.041782 6.814046 7.995978 10.645666 10.322436 \n", + "GSM935146 7.817166 5.582456 7.851524 10.104783 10.148618 \n", + "GSM935147 9.353702 6.078075 7.895813 11.05969 10.465087 \n", + "GSM935148 7.289664 5.335561 7.943047 10.453795 10.084535 \n", + "... ... ... ... ... ... \n", + "GSM935200 7.079421 6.583233 8.350974 10.814444 9.84037 \n", + "GSM935201 7.446122 7.284751 8.205061 9.81193 10.075071 \n", + "GSM935202 7.209145 7.08157 8.402631 11.421799 10.136597 \n", + "GSM935203 7.598421 6.257321 8.258537 10.437635 10.576629 \n", + "Gene symbol MMP14 PLD1 DTX2P1-UPK3BP1-PMS2P11 CAPNS1 HNRNPC \n", + "\n", + " ... 4847 4848 4849 4850 4851 4852 \\\n", + "GSM935144 ... 8.397834 6.495765 7.682125 5.636567 4.662533 4.581447 \n", + "GSM935145 ... 8.307262 5.998426 7.959026 6.265455 6.564568 4.97292 \n", + "GSM935146 ... 8.169248 6.025941 7.20579 5.964245 4.989752 3.925917 \n", + "GSM935147 ... 8.391617 6.466206 8.108325 6.823888 6.039951 4.626001 \n", + "GSM935148 ... 8.699464 6.098069 7.788782 5.897471 4.804696 4.15752 \n", + "... ... ... ... ... ... ... ... \n", + "GSM935200 ... 7.525502 5.588707 7.105646 7.142224 5.614111 3.709397 \n", + "GSM935201 ... 8.209273 5.922377 7.892188 6.476743 5.870292 4.874964 \n", + "GSM935202 ... 6.972532 6.10597 7.300605 7.098034 5.027756 3.903551 \n", + "GSM935203 ... 
8.648412 6.04367 7.982976 5.885975 6.998636 4.572075 \n", + "Gene symbol ... C11orf24 B4GALT7 DVL2 RBKS SENP5 POLR2J4 \n", + "\n", + " 4853 4854 4855 4856 \n", + "GSM935144 4.913772 7.314994 4.946271 5.824582 \n", + "GSM935145 5.502765 7.732989 7.491779 6.065943 \n", + "GSM935146 4.362655 7.430351 7.569171 5.600952 \n", + "GSM935147 6.16125 7.9986 8.799876 6.02798 \n", + "GSM935148 4.363255 7.179146 6.736683 5.498095 \n", + "... ... ... ... ... \n", + "GSM935200 3.954994 6.490069 4.385806 5.972305 \n", + "GSM935201 4.536575 6.740481 4.728352 6.589286 \n", + "GSM935202 4.801002 7.021587 5.007519 6.399793 \n", + "GSM935203 5.612742 7.441464 4.96548 6.258775 \n", + "Gene symbol INO80B-WBP1///INO80B SNHG17 MEX3D DCAF15 \n", + "\n", + "[61 rows x 4857 columns]" + ] + }, + "execution_count": 128, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_test" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "id": "a701e30d", + "metadata": {}, + "outputs": [], + "source": [ + "#df_test = df_test[:-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "id": "77d974be", + "metadata": {}, + "outputs": [], + "source": [ + "#Transform the input data\n", + "df_train.rename(columns=df_train.iloc[0], inplace = True)\n", + "df_train.drop(df_train.index[0], inplace = True)\n", + "df_train=df_train.reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "id": "2e78017d", + "metadata": {}, + "outputs": [], + "source": [ + "#Transform the input data\n", + "df_test.rename(columns=df_test.iloc[-1], inplace = True)\n", + "df_test.drop(df_test.index[-1], inplace = True)\n", + "df_test=df_test.reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "id": "ea60801d", + "metadata": {}, + "outputs": [], + "source": [ + "metadata_test = pd.read_csv(\"DS/mRNA_DS_metadata_col_test_info.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "id": "58d531b9", + "metadata": {}, + 
"outputs": [], + "source": [ + "df_test= df_test.merge(metadata_test, left_on=\"index\", right_on= \"Unnamed: 0\")" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "id": "7910f2fa", + "metadata": {}, + "outputs": [], + "source": [ + "df_test['title0'] = df_test['title0'].replace('(?i)mucosa|normal|healthy', 0, regex=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "id": "c102e10e", + "metadata": {}, + "outputs": [], + "source": [ + "df_test['title0'] = df_test['title0'].replace('(?i)Tumor|Cancer|carcinoma', 1, regex=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "id": "6c255d2e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "title0\n", + "0 30\n", + "1 30\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 136, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_test['title0'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "id": "636b44ab", + "metadata": {}, + "outputs": [], + "source": [ + "df_test = df_test[pd.to_numeric(df_test['title0'], errors='coerce').notnull()]#remove all non-numeric data from the column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "34896f9a", + "metadata": {}, + "outputs": [], + "source": [ + "df_test= df_test.drop(['index', 'Unnamed: 0'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "359b5bab", + "metadata": {}, + "outputs": [], + "source": [ + "df_test= df_test.rename(columns={\"title0\": \"index\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "dc14bb1c", + "metadata": {}, + "outputs": [], + "source": [ + "X_test=df_test.drop(\"index\",axis=1)\n", + "y_test=df_test['index']" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "4c50c510", + "metadata": {}, + "outputs": [], + "source": [ + "metadata_train = pd.read_csv(\"DS/mRNA_DS_metadata_col_info.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "6730cf89", + "metadata": {}, + "outputs": [], + "source": [ + "df_train= df_train.merge(metadata_train, left_on=\"index\", right_on= \"Unnamed: 0\")" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "7a8ad8ad", + "metadata": {}, + "outputs": [], + "source": [ + "df_train['title0'] = df_train['title0'].replace('(?i)mucosa|normal|healthy', 0, regex=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "a8cf8643", + "metadata": {}, + "outputs": [], + "source": [ + "df_train['title0'] = df_train['title0'].replace('(?i)Tumor|Cancer|carcinoma', 1, regex=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "c9e8772b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "title0\n", + "0 111\n", + "1 108\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train['title0'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "f5d203aa", + "metadata": {}, + "outputs": [], + "source": [ + "df_train = 
df_train[pd.to_numeric(df_train['title0'], errors='coerce').notnull()]#remove all non-numeric data from the column." + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "523bdaa6", + "metadata": {}, + "outputs": [], + "source": [ + "df_train= df_train.drop(['index', 'Unnamed: 0'], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "46a6fb36", + "metadata": {}, + "outputs": [], + "source": [ + "df_train= df_train.rename(columns={\"title0\": \"index\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "e26f88c5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "index\n", + "0 111\n", + "1 108\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train['index'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "fbaf2507", + "metadata": {}, + "outputs": [], + "source": [ + "df_train= df_train.apply(pd.to_numeric)" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "38a993d9", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "<Figure size 640x480 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.manifold import TSNE\n", + "# Step 1: Embed df_train with t-SNE (note: df_train still contains the 'index' label column here)\n", + "tsne = TSNE(n_components=3)\n", + "embedded_data = tsne.fit_transform(df_train)\n", + "\n", + "# Step 2: Separate data points by class\n", + "class_1_indices = np.where(df_train['index'] == 0)[0]\n", + "class_2_indices = np.where(df_train['index'] == 1)[0]\n", + "\n", + "class_1_data = embedded_data[class_1_indices]\n", + "class_2_data = embedded_data[class_2_indices]\n", + "\n", + "# Step 3: Plot the t-SNE plot with different colors for each class\n", + "plt.scatter(class_1_data[:, 0], class_1_data[:, 1], color='red', label='Non-Cancer')\n", + 
"plt.scatter(class_2_data[:, 0], class_2_data[:, 1], color='blue', label='Cancer')\n", + "\n", + "plt.title('t-SNE Plot')\n", + "plt.xlabel('Dimension 1')\n", + "plt.ylabel('Dimension 2')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "776cfbee", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "index\n", + "0 111\n", + "1 108\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train['index'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "8c0011ea", + "metadata": {}, + "outputs": [], + "source": [ + "X=df_train.drop(\"index\",axis=1)\n", + "y=df_train['index']" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "fc606979", + "metadata": {}, + "outputs": [], + "source": [ + "X=X.astype('int')" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "93e28118", + "metadata": {}, + "outputs": [], + "source": [ + "y=y.astype('int')" + ] + }, + { + "cell_type": "markdown", + "id": "e9830b6c", + "metadata": {}, + "source": [ + "# Feature Selection" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "f0f1977f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ABAT</th>\n", + " <th>ABHD5</th>\n", + " <th>ABLIM1</th>\n", + " <th>ABLIM3</th>\n", + " <th>ACAA1</th>\n", + " <th>ACADM</th>\n", + " <th>ACADVL</th>\n", + " <th>ACD</th>\n", + " <th>ACLY</th>\n", + " <th>ACOT11</th>\n", 
+ " <th>...</th>\n", + " <th>XYLT1</th>\n", + " <th>YOD1</th>\n", + " <th>YTHDC1</th>\n", + " <th>ZBTB16</th>\n", + " <th>ZDHHC13</th>\n", + " <th>ZFP64</th>\n", + " <th>ZNF185</th>\n", + " <th>ZNF365</th>\n", + " <th>ZNF426</th>\n", + " <th>ZNF710</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>186</td>\n", + " <td>2603</td>\n", + " <td>42653</td>\n", + " <td>220</td>\n", + " <td>2132</td>\n", + " <td>22869</td>\n", + " <td>19775</td>\n", + " <td>4486</td>\n", + " <td>8835</td>\n", + " <td>2332</td>\n", + " <td>...</td>\n", + " <td>392</td>\n", + " <td>222</td>\n", + " <td>295</td>\n", + " <td>4598</td>\n", + " <td>7009</td>\n", + " <td>568</td>\n", + " <td>65123</td>\n", + " <td>56</td>\n", + " <td>308</td>\n", + " <td>10385</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>93</td>\n", + " <td>1137</td>\n", + " <td>16493</td>\n", + " <td>69</td>\n", + " <td>1816</td>\n", + " <td>17788</td>\n", + " <td>16870</td>\n", + " <td>7993</td>\n", + " <td>21434</td>\n", + " <td>2211</td>\n", + " <td>...</td>\n", + " <td>62</td>\n", + " <td>78</td>\n", + " <td>144</td>\n", + " <td>2132</td>\n", + " <td>2602</td>\n", + " <td>1720</td>\n", + " <td>13531</td>\n", + " <td>47</td>\n", + " <td>140</td>\n", + " <td>6441</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>198</td>\n", + " <td>5593</td>\n", + " <td>53918</td>\n", + " <td>263</td>\n", + " <td>3490</td>\n", + " <td>39276</td>\n", + " <td>25847</td>\n", + " <td>4413</td>\n", + " <td>9212</td>\n", + " <td>7419</td>\n", + " <td>...</td>\n", + " <td>481</td>\n", + " <td>355</td>\n", + " <td>308</td>\n", + " <td>1071</td>\n", + " <td>10289</td>\n", + " <td>379</td>\n", + " <td>65131</td>\n", + " <td>206</td>\n", + " <td>1251</td>\n", + " <td>11768</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>104</td>\n", + " <td>1636</td>\n", + " <td>19203</td>\n", + " <td>127</td>\n", + " <td>1518</td>\n", + " <td>17951</td>\n", + " 
<td>16854</td>\n", + " <td>12800</td>\n", + " <td>11939</td>\n", + " <td>5136</td>\n", + " <td>...</td>\n", + " <td>213</td>\n", + " <td>122</td>\n", + " <td>244</td>\n", + " <td>482</td>\n", + " <td>3578</td>\n", + " <td>1990</td>\n", + " <td>37715</td>\n", + " <td>66</td>\n", + " <td>361</td>\n", + " <td>8517</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>205</td>\n", + " <td>4720</td>\n", + " <td>56984</td>\n", + " <td>495</td>\n", + " <td>3309</td>\n", + " <td>24427</td>\n", + " <td>28197</td>\n", + " <td>5718</td>\n", + " <td>8192</td>\n", + " <td>6748</td>\n", + " <td>...</td>\n", + " <td>169</td>\n", + " <td>275</td>\n", + " <td>200</td>\n", + " <td>3632</td>\n", + " <td>7275</td>\n", + " <td>509</td>\n", + " <td>65138</td>\n", + " <td>188</td>\n", + " <td>587</td>\n", + " <td>9390</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>214</th>\n", + " <td>4</td>\n", + " <td>5</td>\n", + " <td>9</td>\n", + " <td>6</td>\n", + " <td>7</td>\n", + " <td>7</td>\n", + " <td>10</td>\n", + " <td>8</td>\n", + " <td>7</td>\n", + " <td>7</td>\n", + " <td>...</td>\n", + " <td>7</td>\n", + " <td>4</td>\n", + " <td>9</td>\n", + " <td>6</td>\n", + " <td>7</td>\n", + " <td>7</td>\n", + " <td>7</td>\n", + " <td>6</td>\n", + " <td>5</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>215</th>\n", + " <td>5</td>\n", + " <td>5</td>\n", + " <td>8</td>\n", + " <td>8</td>\n", + " <td>8</td>\n", + " <td>8</td>\n", + " <td>9</td>\n", + " <td>8</td>\n", + " <td>6</td>\n", + " 
<td>7</td>\n", + " <td>...</td>\n", + " <td>7</td>\n", + " <td>4</td>\n", + " <td>9</td>\n", + " <td>6</td>\n", + " <td>6</td>\n", + " <td>7</td>\n", + " <td>9</td>\n", + " <td>6</td>\n", + " <td>5</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>216</th>\n", + " <td>4</td>\n", + " <td>5</td>\n", + " <td>9</td>\n", + " <td>6</td>\n", + " <td>7</td>\n", + " <td>8</td>\n", + " <td>9</td>\n", + " <td>8</td>\n", + " <td>7</td>\n", + " <td>7</td>\n", + " <td>...</td>\n", + " <td>7</td>\n", + " <td>5</td>\n", + " <td>9</td>\n", + " <td>6</td>\n", + " <td>6</td>\n", + " <td>7</td>\n", + " <td>8</td>\n", + " <td>6</td>\n", + " <td>5</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>217</th>\n", + " <td>5</td>\n", + " <td>4</td>\n", + " <td>9</td>\n", + " <td>7</td>\n", + " <td>8</td>\n", + " <td>8</td>\n", + " <td>9</td>\n", + " <td>8</td>\n", + " <td>6</td>\n", + " <td>7</td>\n", + " <td>...</td>\n", + " <td>7</td>\n", + " <td>4</td>\n", + " <td>10</td>\n", + " <td>6</td>\n", + " <td>8</td>\n", + " <td>7</td>\n", + " <td>8</td>\n", + " <td>6</td>\n", + " <td>5</td>\n", + " <td>8</td>\n", + " </tr>\n", + " <tr>\n", + " <th>218</th>\n", + " <td>4</td>\n", + " <td>4</td>\n", + " <td>7</td>\n", + " <td>6</td>\n", + " <td>8</td>\n", + " <td>8</td>\n", + " <td>10</td>\n", + " <td>8</td>\n", + " <td>7</td>\n", + " <td>7</td>\n", + " <td>...</td>\n", + " <td>6</td>\n", + " <td>5</td>\n", + " <td>10</td>\n", + " <td>6</td>\n", + " <td>6</td>\n", + " <td>7</td>\n", + " <td>6</td>\n", + " <td>5</td>\n", + " <td>5</td>\n", + " <td>8</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>219 rows × 578 columns</p>\n", + "</div>" + ], + "text/plain": [ + " ABAT ABHD5 ABLIM1 ABLIM3 ACAA1 ACADM ACADVL ACD ACLY ACOT11 \\\n", + "0 186 2603 42653 220 2132 22869 19775 4486 8835 2332 \n", + "1 93 1137 16493 69 1816 17788 16870 7993 21434 2211 \n", + "2 198 5593 53918 263 3490 39276 25847 4413 9212 7419 \n", + "3 104 1636 19203 127 1518 17951 16854 12800 11939 
5136 \n", + "4 205 4720 56984 495 3309 24427 28197 5718 8192 6748 \n", + ".. ... ... ... ... ... ... ... ... ... ... \n", + "214 4 5 9 6 7 7 10 8 7 7 \n", + "215 5 5 8 8 8 8 9 8 6 7 \n", + "216 4 5 9 6 7 8 9 8 7 7 \n", + "217 5 4 9 7 8 8 9 8 6 7 \n", + "218 4 4 7 6 8 8 10 8 7 7 \n", + "\n", + " ... XYLT1 YOD1 YTHDC1 ZBTB16 ZDHHC13 ZFP64 ZNF185 ZNF365 ZNF426 \\\n", + "0 ... 392 222 295 4598 7009 568 65123 56 308 \n", + "1 ... 62 78 144 2132 2602 1720 13531 47 140 \n", + "2 ... 481 355 308 1071 10289 379 65131 206 1251 \n", + "3 ... 213 122 244 482 3578 1990 37715 66 361 \n", + "4 ... 169 275 200 3632 7275 509 65138 188 587 \n", + ".. ... ... ... ... ... ... ... ... ... ... \n", + "214 ... 7 4 9 6 7 7 7 6 5 \n", + "215 ... 7 4 9 6 6 7 9 6 5 \n", + "216 ... 7 5 9 6 6 7 8 6 5 \n", + "217 ... 7 4 10 6 8 7 8 6 5 \n", + "218 ... 6 5 10 6 6 7 6 5 5 \n", + "\n", + " ZNF710 \n", + "0 10385 \n", + "1 6441 \n", + "2 11768 \n", + "3 8517 \n", + "4 9390 \n", + ".. ... \n", + "214 8 \n", + "215 8 \n", + "216 8 \n", + "217 8 \n", + "218 8 \n", + "\n", + "[219 rows x 578 columns]" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "1cc528fb", + "metadata": {}, + "outputs": [], + "source": [ + "# LASSO model:\n", + "lasso = Lasso(alpha=1)\n", + "# fitting the model:\n", + "lasso.fit(X, y)\n", + "# select all coefficients and the feature names\n", + "lasso_coefs = lasso.coef_\n", + "feature_names = X.columns\n", + "\n", + "# collect the selected features:\n", + "selected_feature_indices = np.nonzero(lasso_coefs)[0]\n", + "selected_features = [feature_names[i] for i in selected_feature_indices]\n", + "X_selected = X.iloc[:, selected_feature_indices]" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "8afa29ae", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "98" + ] + }, + "execution_count": 84, + "metadata": {}, 
+ "output_type": "execute_result" + } + ], + "source": [ + "len(selected_features)" + ] + }, + { + "cell_type": "markdown", + "id": "6cee6462", + "metadata": {}, + "source": [ + "# Test train split" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "3af09ef8", + "metadata": {}, + "outputs": [], + "source": [ + "X_train = X_selected\n", + "y_train = y" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "129430e6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(index\n", + " 0 30\n", + " 1 30\n", + " Name: count, dtype: int64,\n", + " index\n", + " 0 111\n", + " 1 108\n", + " Name: count, dtype: int64)" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_test.value_counts(),y_train.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "1cfe2a06", + "metadata": {}, + "source": [ + "# Cross validation" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "1fbca4b8", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.svm import SVC\n", + "# we can add class_weight='balanced' to penalize mistakes\n", + "svm_model = SVC(kernel = \"linear\", probability=True,random_state=47)" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "id": "0502e118", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Defining parameter range\n", + "param_grid = {\n", + " 'C': [0.0005,0.0001,0.001,0.1]\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "id": "7f2d18b0", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "grid = GridSearchCV(svm_model, param_grid, refit=True, verbose=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "id": "79790f1d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 5 folds for each of 4 candidates, totalling 20 fits\n", + "[CV 1/5] END ..........................C=0.0005;, 
score=0.955 total time= 0.0s\n", + "[CV 2/5] END ..........................C=0.0005;, score=0.886 total time= 0.0s\n", + "[CV 3/5] END ..........................C=0.0005;, score=0.955 total time= 0.0s\n", + "[CV 4/5] END ..........................C=0.0005;, score=0.909 total time= 0.0s\n", + "[CV 5/5] END ..........................C=0.0005;, score=0.884 total time= 0.0s\n", + "[CV 1/5] END ..........................C=0.0001;, score=0.909 total time= 0.0s\n", + "[CV 2/5] END ..........................C=0.0001;, score=0.773 total time= 0.0s\n", + "[CV 3/5] END ..........................C=0.0001;, score=0.955 total time= 0.0s\n", + "[CV 4/5] END ..........................C=0.0001;, score=0.909 total time= 0.0s\n", + "[CV 5/5] END ..........................C=0.0001;, score=0.860 total time= 0.0s\n", + "[CV 1/5] END ...........................C=0.001;, score=0.955 total time= 0.0s\n", + "[CV 2/5] END ...........................C=0.001;, score=0.977 total time= 0.0s\n", + "[CV 3/5] END ...........................C=0.001;, score=0.955 total time= 0.0s\n", + "[CV 4/5] END ...........................C=0.001;, score=0.909 total time= 0.0s\n", + "[CV 5/5] END ...........................C=0.001;, score=0.884 total time= 0.0s\n", + "[CV 1/5] END .............................C=0.1;, score=0.864 total time= 0.0s\n", + "[CV 2/5] END .............................C=0.1;, score=0.955 total time= 0.0s\n", + "[CV 3/5] END .............................C=0.1;, score=0.977 total time= 0.0s\n", + "[CV 4/5] END .............................C=0.1;, score=0.977 total time= 0.0s\n", + "[CV 5/5] END .............................C=0.1;, score=0.953 total time= 0.0s\n" + ] + }, + { + "data": { + "text/html": [ + "<style>#sk-container-id-5 {color: black;background-color: white;}#sk-container-id-5 pre{padding: 0;}#sk-container-id-5 div.sk-toggleable {background-color: white;}#sk-container-id-5 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 
0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-5 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-5 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-5 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-5 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-5 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-5 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-5 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-5 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-5 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-5 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-5 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-5 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-5 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-5 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-5 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-5 div.sk-serial {display: flex;flex-direction: 
column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-5 div.sk-item {position: relative;z-index: 1;}#sk-container-id-5 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-5 div.sk-item::before, #sk-container-id-5 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-5 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-5 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-5 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-5 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-5 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-5 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-5 div.sk-label-container {text-align: center;}#sk-container-id-5 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-5 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-5\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>GridSearchCV(estimator=SVC(kernel='linear', probability=True, random_state=47),\n", + " param_grid={'C': [0.0005, 0.0001, 0.001, 0.1]}, verbose=3)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-11\" type=\"checkbox\" ><label for=\"sk-estimator-id-11\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">GridSearchCV</label><div class=\"sk-toggleable__content\"><pre>GridSearchCV(estimator=SVC(kernel='linear', probability=True, random_state=47),\n", + " param_grid={'C': [0.0005, 0.0001, 0.001, 0.1]}, verbose=3)</pre></div></div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-12\" type=\"checkbox\" ><label for=\"sk-estimator-id-12\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">estimator: SVC</label><div class=\"sk-toggleable__content\"><pre>SVC(kernel='linear', probability=True, random_state=47)</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-13\" type=\"checkbox\" ><label for=\"sk-estimator-id-13\" class=\"sk-toggleable__label 
sk-toggleable__label-arrow\">SVC</label><div class=\"sk-toggleable__content\"><pre>SVC(kernel='linear', probability=True, random_state=47)</pre></div></div></div></div></div></div></div></div></div></div>" + ], + "text/plain": [ + "GridSearchCV(estimator=SVC(kernel='linear', probability=True, random_state=47),\n", + " param_grid={'C': [0.0005, 0.0001, 0.001, 0.1]}, verbose=3)" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Fitting the model for grid search\n", + "grid.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "id": "5d327876", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'C': 0.1}\n", + "SVC(C=0.1, kernel='linear', probability=True, random_state=47)\n" + ] + } + ], + "source": [ + "# print best parameter after tuning\n", + "print(grid.best_params_)\n", + " \n", + "# print how our model looks after hyper-parameter tuning\n", + "print(grid.best_estimator_)" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "id": "f8d67e2f", + "metadata": {}, + "outputs": [], + "source": [ + "# Restrict X_test to the feature columns of X_train, in the same order\n", + "X_test = X_test.loc[:, X_train.columns]" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "c8c233d6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.97 0.93 0.95 30\n", + " 1 0.94 0.97 0.95 30\n", + "\n", + " accuracy 0.95 60\n", + " macro avg 0.95 0.95 0.95 60\n", + "weighted avg 0.95 0.95 0.95 60\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.metrics import classification_report, confusion_matrix\n", + "grid_predictions = grid.predict(X_test)\n", + "print(classification_report(y_test, grid_predictions))" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "id": "3b2776c0", + "metadata": {}, + "outputs": [ + { + 
"data": { + "text/html": [ + "<style>#sk-container-id-6 {color: black;background-color: white;}#sk-container-id-6 pre{padding: 0;}#sk-container-id-6 div.sk-toggleable {background-color: white;}#sk-container-id-6 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-6 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-6 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-6 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-6 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-6 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-6 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-6 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-6 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-6 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-6 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-6 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-6 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-6 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 
1;}#sk-container-id-6 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-6 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-6 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-6 div.sk-item {position: relative;z-index: 1;}#sk-container-id-6 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-6 div.sk-item::before, #sk-container-id-6 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-6 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-6 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-6 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-6 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-6 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-6 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-6 div.sk-label-container {text-align: center;}#sk-container-id-6 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-6 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-6\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>SVC(C=0.1, kernel='linear', probability=True, random_state=47)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-14\" type=\"checkbox\" checked><label for=\"sk-estimator-id-14\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">SVC</label><div class=\"sk-toggleable__content\"><pre>SVC(C=0.1, kernel='linear', probability=True, random_state=47)</pre></div></div></div></div></div>" + ], + "text/plain": [ + "SVC(C=0.1, kernel='linear', probability=True, random_state=47)" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_svm = grid.best_estimator_\n", + "model_svm.fit(X_train,y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "id": "94871ada", + "metadata": {}, + "outputs": [], + "source": [ + "y_proba = model_svm.fit(X_train, y_train).predict_proba(X_test)[:,1]" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "id": "f8d4142d", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import StratifiedKFold\n", + "from sklearn.feature_selection import SelectKBest, f_classif\n", + "from sklearn.metrics import auc\n", + "def roc(X_train,y_train,model,label):\n", + " cv = StratifiedKFold(n_splits=6)\n", + " classifier = model\n", + " tprs = []\n", + " aucs = []\n", + " mean_fpr = np.linspace(0, 1, 
100)\n", + "\n", + " fig, ax = plt.subplots(figsize=(6, 6))\n", + " for fold, (train, test) in enumerate(cv.split(X_train, y_train)):\n", + " classifier.fit(X_train.iloc[train], y_train.iloc[train])\n", + " viz = RocCurveDisplay.from_estimator(\n", + " classifier,\n", + " X_train.iloc[test],\n", + " y_train.iloc[test],\n", + " name=f\"ROC fold {fold}\",\n", + " alpha=0.3,\n", + " lw=1,\n", + " ax=ax,\n", + " )\n", + " interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)\n", + " interp_tpr[0] = 0.0\n", + " tprs.append(interp_tpr)\n", + " aucs.append(viz.roc_auc)\n", + " ax.plot([0, 1], [0, 1], \"k--\", label=\"chance level (AUC = 0.5)\")\n", + "\n", + " mean_tpr = np.mean(tprs, axis=0)\n", + " mean_tpr[-1] = 1.0\n", + " mean_auc = auc(mean_fpr, mean_tpr)\n", + " std_auc = np.std(aucs)\n", + " ax.plot(\n", + " mean_fpr,\n", + " mean_tpr,\n", + " color=\"b\",\n", + " label=r\"Mean ROC (AUC = %0.2f $\\pm$ %0.2f)\" % (mean_auc, std_auc),\n", + " lw=2,\n", + " alpha=0.8,\n", + " )\n", + "\n", + " std_tpr = np.std(tprs, axis=0)\n", + " tprs_upper = np.minimum(mean_tpr + std_tpr, 1)\n", + " tprs_lower = np.maximum(mean_tpr - std_tpr, 0)\n", + " ax.fill_between(\n", + " mean_fpr,\n", + " tprs_lower,\n", + " tprs_upper,\n", + " color=\"grey\",\n", + " alpha=0.2,\n", + " label=r\"$\\pm$ 1 std. 
dev.\",\n", + " )\n", + "\n", + " ax.set(\n", + " xlim=[-0.05, 1.05],\n", + " ylim=[-0.05, 1.05],\n", + " xlabel=\"False Positive Rate\",\n", + " ylabel=\"True Positive Rate\",\n", + " title=label,\n", + " )\n", + " ax.axis(\"square\")\n", + " ax.legend(loc=\"lower right\")\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "id": "802a96a5", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "<Figure size 600x600 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "model = svm_model\n", + "label=\"ROC curve of training data\"\n", + "roc(X_train,y_train,model,label)" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "id": "5b8b6681", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "<Figure size 600x600 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "label=\"ROC curve of testing data\"\n", + "roc(X_test,y_test,model,label)" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "033ca70b", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "<Figure size 640x480 with 2 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#######CONFUSION MATRIX ###########\n", + "from sklearn import metrics\n", + "y_test_pred_svm = model_svm.predict(X_test)\n", + "confusion_matrix_test = metrics.confusion_matrix(y_test, y_test_pred_svm)\n", + "cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix_test, display_labels = [False, True])\n", + "cm_display.plot()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "2f9bc4a1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy : 0.95\n", + "Sensitivity : 0.9333333333333333\n", + "Specificity : 0.9666666666666667\n" + 
] + } + ], + "source": [ + "total1=sum(sum(confusion_matrix_test))\n", + "# Accuracy from the confusion matrix. NOTE(review): rows are true labels (0, 1), so the 'Sensitivity' below is the recall of class 0 and 'Specificity' the recall of class 1 - confirm class 0 is the intended positive class\n", + "accuracy1=(confusion_matrix_test[0,0]+confusion_matrix_test[1,1])/total1\n", + "print ('Accuracy : ', accuracy1)\n", + "\n", + "sensitivity1 = confusion_matrix_test[0,0]/(confusion_matrix_test[0,0]+confusion_matrix_test[0,1])\n", + "print('Sensitivity : ', sensitivity1 )\n", + "\n", + "specificity1 = confusion_matrix_test[1,1]/(confusion_matrix_test[1,0]+confusion_matrix_test[1,1])\n", + "print('Specificity : ', specificity1)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "8d6a7110", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "<Figure size 640x480 with 2 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#######CONFUSION MATRIX ###########\n", + "y_train_pred_svm = model_svm.predict(X_train)\n", + "confusion_matrix_train = metrics.confusion_matrix(y_train, y_train_pred_svm)\n", + "cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix_train)\n", + "cm_display.plot()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 226, + "id": "81d0fac2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy : 0.9863013698630136\n", + "Sensitivity : 0.990990990990991\n", + "Specificity : 0.9814814814814815\n" + ] + } + ], + "source": [ + "total1=sum(sum(confusion_matrix_train))\n", + "# Accuracy from the confusion matrix. NOTE(review): rows are true labels (0, 1), so the 'Sensitivity' below is the recall of class 0 and 'Specificity' the recall of class 1 - confirm class 0 is the intended positive class\n", + "accuracy1=(confusion_matrix_train[0,0]+confusion_matrix_train[1,1])/total1\n", + "print ('Accuracy : ', accuracy1)\n", + "\n", + "sensitivity1 = confusion_matrix_train[0,0]/(confusion_matrix_train[0,0]+confusion_matrix_train[0,1])\n", + "print('Sensitivity : ', sensitivity1 )\n", + "\n", + "specificity1 = confusion_matrix_train[1,1]/(confusion_matrix_train[1,0]+confusion_matrix_train[1,1])\n", + 
"print('Specificity : ', specificity1)" + ] + }, + { + "cell_type": "code", + "execution_count": 227, + "id": "c1095af0", + "metadata": {}, + "outputs": [], + "source": [ + "# for important features:\n", + "important_feat = model_svm.coef_[0]\n", + "#get indices of those important features\n", + "idx = important_feat.argsort(kind= \"quicksort\")\n", + "idx= idx[::-1][:50]" + ] + }, + { + "cell_type": "code", + "execution_count": 228, + "id": "ae7e0162", + "metadata": {}, + "outputs": [], + "source": [ + "df1 = X_selected.T" + ] + }, + { + "cell_type": "code", + "execution_count": 229, + "id": "1d97f818", + "metadata": {}, + "outputs": [], + "source": [ + "top_met = df1.iloc[idx]" + ] + }, + { + "cell_type": "code", + "execution_count": 230, + "id": "4cd4227b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['COL1A1', 'ECT2', 'COL5A2', 'MCM2', 'MMP10', 'RPN1', 'TRIP13', 'FSCN1',\n", + " 'HSPBAP1', 'IGF2BP2', 'EFNA1', 'IGFBP3', 'EMP1', 'TMPRSS11E', 'PSMB9',\n", + " 'GABRP', 'NT5C2', 'RHCG', 'PITX1', 'RUVBL1', 'CYP4B1', 'SLC2A1',\n", + " 'LYPD3', 'GALNT1', 'IL1RN', 'TAPBP', 'DHRS2', 'SPRR3', 'SPINK5',\n", + " 'SCNN1A', 'TYMP', 'LAMC2', 'LEPROTL1', 'TSPAN6', 'INPP1', 'STK24',\n", + " 'SERPINB2', 'CRNN', 'MYH10', 'ECM1', 'HOPX', 'TFAP2B', 'IFI35',\n", + " 'TMPRSS11D', 'UCHL1', 'KRT4', 'AQP3', 'ACLY', 'ATP6V1D', 'TST'],\n", + " dtype='object')" + ] + }, + "execution_count": 230, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "top_met.index" + ] + }, + { + "cell_type": "code", + "execution_count": 232, + "id": "8f6d88bb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['ACLY', 'ACPP', 'AIM2', 'ALDH9A1', 'ALOX12', 'ANO1', 'AQP3', 'ATP6V1D',\n", + " 'CCNG2', 'CES2', 'CFD', 'CH25H', 'CLIC3', 'COL1A1', 'COL5A2', 'CRABP2',\n", + " 'CRISP3', 'CRNN', 'CYP4B1', 'DHRS1', 'DHRS2', 'DUOX1', 'DUSP5', 'ECM1',\n", + " 'ECT2', 'EFNA1', 'EMP1', 'ENTPD6', 'ERCC3', 'FLG', 'FSCN1', 'GABRP',\n", + " 
'GALE', 'GALNT1', 'GPX3', 'HOPX', 'HSPB8', 'HSPBAP1', 'HSPD1', 'ID4',\n", + " 'IFI35', 'IGF2BP2', 'IGFBP3', 'IL1RN', 'INPP1', 'KANK1', 'KLK13',\n", + " 'KRT4', 'LAMC2', 'LCN2', 'LEPROTL1', 'LYPD3', 'MAL', 'MCM2', 'MMP10',\n", + " 'MUC1', 'MYH10', 'NDRG2', 'NT5C2', 'PCSK5', 'PHLDA1', 'PITX1',\n", + " 'PPP1R3C', 'PSMB9', 'PTN', 'RAB11FIP1', 'RANBP9', 'RHCG', 'RND3',\n", + " 'RPN1', 'RUVBL1', 'SCNN1A', 'SERPINB13', 'SERPINB2', 'SIM2', 'SLC2A1',\n", + " 'SLK', 'SLURP1', 'SPINK5', 'SPRR3', 'SSRP1', 'STK24', 'SYNPO2L',\n", + " 'TAPBP', 'TFAP2B', 'TGIF1', 'TIAM1', 'TJP1', 'TMF1', 'TMPRSS11D',\n", + " 'TMPRSS11E', 'TRIP13', 'TSPAN6', 'TST', 'TYMP', 'UCHL1', 'ZBTB16',\n", + " 'ZNF185'],\n", + " dtype='object')" + ] + }, + "execution_count": 232, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_selected.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d9ff727", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b304c6c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c76098bb", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "321b6028", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- GitLab