Project: aakan96/Datascience-SS2023

Commit 24a860bc, authored 2 years ago by aakan96

Update file DS_miRNA_limma_dataset_xgb_final-F.ipynb

Parent: 8c673fd1
No related branches, tags, or merge requests found.

Showing 1 changed file
Machine Learning/DS_miRNA_limma_dataset_xgb_final-F.ipynb (+0 −385)
@@ -91,391 +91,6 @@

"df=df.reset_index()"
"df=df.reset_index()"
]
]
},
},
{
"cell_type": "code",
"execution_count": 43,
"id": "1647a959",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>index</th>\n",
" <th>dmr_3</th>\n",
" <th>dmr_31a</th>\n",
" <th>dmr_6</th>\n",
" <th>ebv-miR-BART13</th>\n",
" <th>hsa-let-7c</th>\n",
" <th>hsa-let-7d-5p</th>\n",
" <th>hsa-let-7i-5p</th>\n",
" <th>hsa-miR-100-5p</th>\n",
" <th>hsa-miR-101-3p</th>\n",
" <th>...</th>\n",
" <th>hsv2-miR-H24</th>\n",
" <th>hsv2-miR-H25</th>\n",
" <th>hsv2-miR-H6</th>\n",
" <th>hur_1</th>\n",
" <th>hur_2</th>\n",
" <th>hur_4</th>\n",
" <th>hur_5</th>\n",
" <th>hur_6</th>\n",
" <th>miRNABrightCorner30</th>\n",
" <th>mr_1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>GSM1069774</td>\n",
" <td>0.732675</td>\n",
" <td>-0.242559</td>\n",
" <td>0.577801</td>\n",
" <td>-4.469532</td>\n",
" <td>1.195899</td>\n",
" <td>-0.334742</td>\n",
" <td>0.89199</td>\n",
" <td>-2.089223</td>\n",
" <td>-2.757097</td>\n",
" <td>...</td>\n",
" <td>-3.956004</td>\n",
" <td>-3.936689</td>\n",
" <td>-4.099346</td>\n",
" <td>6.98856</td>\n",
" <td>7.041557</td>\n",
" <td>3.822267</td>\n",
" <td>-2.268209</td>\n",
" <td>5.114399</td>\n",
" <td>2.017444</td>\n",
" <td>1.640437</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>GSM1069775</td>\n",
" <td>0.249772</td>\n",
" <td>-0.655514</td>\n",
" <td>0.104933</td>\n",
" <td>-5.209572</td>\n",
" <td>0.498366</td>\n",
" <td>-0.194772</td>\n",
" <td>0.637863</td>\n",
" <td>-2.357572</td>\n",
" <td>-2.196884</td>\n",
" <td>...</td>\n",
" <td>-4.334103</td>\n",
" <td>-4.561624</td>\n",
" <td>-4.719714</td>\n",
" <td>6.774479</td>\n",
" <td>6.862654</td>\n",
" <td>3.529789</td>\n",
" <td>-2.656642</td>\n",
" <td>4.327117</td>\n",
" <td>2.022346</td>\n",
" <td>0.79426</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>GSM1069776</td>\n",
" <td>0.400779</td>\n",
" <td>-0.597444</td>\n",
" <td>0.232702</td>\n",
" <td>-4.952808</td>\n",
" <td>1.081166</td>\n",
" <td>0.249982</td>\n",
" <td>1.45018</td>\n",
" <td>-1.138559</td>\n",
" <td>-1.802774</td>\n",
" <td>...</td>\n",
" <td>-4.550077</td>\n",
" <td>-4.40729</td>\n",
" <td>-4.621278</td>\n",
" <td>6.808404</td>\n",
" <td>6.75867</td>\n",
" <td>3.496675</td>\n",
" <td>-2.676555</td>\n",
" <td>4.616284</td>\n",
" <td>1.498011</td>\n",
" <td>1.584544</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>GSM1069777</td>\n",
" <td>0.380263</td>\n",
" <td>-0.900491</td>\n",
" <td>0.243207</td>\n",
" <td>-4.892073</td>\n",
" <td>-0.023958</td>\n",
" <td>-0.980435</td>\n",
" <td>1.071857</td>\n",
" <td>-2.077406</td>\n",
" <td>-2.11406</td>\n",
" <td>...</td>\n",
" <td>-4.018911</td>\n",
" <td>-4.203106</td>\n",
" <td>-3.938707</td>\n",
" <td>6.524773</td>\n",
" <td>6.497959</td>\n",
" <td>3.541502</td>\n",
" <td>-3.073553</td>\n",
" <td>4.581648</td>\n",
" <td>0.789822</td>\n",
" <td>1.255367</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>GSM1069778</td>\n",
" <td>0.422207</td>\n",
" <td>-0.414831</td>\n",
" <td>-0.000781</td>\n",
" <td>-5.139127</td>\n",
" <td>1.077485</td>\n",
" <td>-0.684875</td>\n",
" <td>0.724751</td>\n",
" <td>-0.689096</td>\n",
" <td>-1.182558</td>\n",
" <td>...</td>\n",
" <td>-3.690971</td>\n",
" <td>-4.332452</td>\n",
" <td>-4.178727</td>\n",
" <td>6.562608</td>\n",
" <td>6.529399</td>\n",
" <td>3.305132</td>\n",
" <td>-2.964948</td>\n",
" <td>4.487481</td>\n",
" <td>1.219583</td>\n",
" <td>0.951615</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>233</th>\n",
" <td>GSM1070007</td>\n",
" <td>0.98797</td>\n",
" <td>-0.118186</td>\n",
" <td>0.750199</td>\n",
" <td>-4.572984</td>\n",
" <td>0.696251</td>\n",
" <td>-1.089669</td>\n",
" <td>0.826</td>\n",
" <td>-1.604393</td>\n",
" <td>-2.87334</td>\n",
" <td>...</td>\n",
" <td>-2.163581</td>\n",
" <td>-2.15805</td>\n",
" <td>-2.302647</td>\n",
" <td>7.093144</td>\n",
" <td>7.150126</td>\n",
" <td>3.899704</td>\n",
" <td>-2.954284</td>\n",
" <td>5.505105</td>\n",
" <td>2.457963</td>\n",
" <td>2.142301</td>\n",
" </tr>\n",
" <tr>\n",
" <th>234</th>\n",
" <td>GSM1070008</td>\n",
" <td>-0.194781</td>\n",
" <td>-0.710519</td>\n",
" <td>-0.700226</td>\n",
" <td>-5.651293</td>\n",
" <td>0.742722</td>\n",
" <td>-0.964527</td>\n",
" <td>0.570816</td>\n",
" <td>-1.046029</td>\n",
" <td>-1.840615</td>\n",
" <td>...</td>\n",
" <td>-4.507365</td>\n",
" <td>-4.23831</td>\n",
" <td>-4.63219</td>\n",
" <td>6.18658</td>\n",
" <td>6.232722</td>\n",
" <td>2.788619</td>\n",
" <td>-3.103706</td>\n",
" <td>4.340513</td>\n",
" <td>0.232713</td>\n",
" <td>1.067806</td>\n",
" </tr>\n",
" <tr>\n",
" <th>235</th>\n",
" <td>GSM1070009</td>\n",
" <td>0.21218</td>\n",
" <td>-0.284657</td>\n",
" <td>-0.32472</td>\n",
" <td>-4.800142</td>\n",
" <td>1.0062</td>\n",
" <td>-0.141699</td>\n",
" <td>0.80704</td>\n",
" <td>-0.993146</td>\n",
" <td>-0.823621</td>\n",
" <td>...</td>\n",
" <td>-2.737709</td>\n",
" <td>-2.644713</td>\n",
" <td>-3.253632</td>\n",
" <td>6.505956</td>\n",
" <td>6.548781</td>\n",
" <td>3.12575</td>\n",
" <td>-2.917537</td>\n",
" <td>4.838599</td>\n",
" <td>0.863574</td>\n",
" <td>1.203499</td>\n",
" </tr>\n",
" <tr>\n",
" <th>236</th>\n",
" <td>GSM1070010</td>\n",
" <td>0.330997</td>\n",
" <td>-0.19446</td>\n",
" <td>-0.206405</td>\n",
" <td>-4.840442</td>\n",
" <td>1.521159</td>\n",
" <td>-0.424901</td>\n",
" <td>0.886358</td>\n",
" <td>-0.031455</td>\n",
" <td>-1.584939</td>\n",
" <td>...</td>\n",
" <td>-3.292034</td>\n",
" <td>-2.941633</td>\n",
" <td>-3.939222</td>\n",
" <td>6.790132</td>\n",
" <td>6.829164</td>\n",
" <td>3.365475</td>\n",
" <td>-2.736411</td>\n",
" <td>5.185601</td>\n",
" <td>0.846454</td>\n",
" <td>1.604729</td>\n",
" </tr>\n",
" <tr>\n",
" <th>237</th>\n",
" <td>GSM1070011</td>\n",
" <td>0.474815</td>\n",
" <td>0.043697</td>\n",
" <td>-0.102511</td>\n",
" <td>-4.849285</td>\n",
" <td>1.239637</td>\n",
" <td>-0.704124</td>\n",
" <td>0.698355</td>\n",
" <td>-0.414715</td>\n",
" <td>-1.721427</td>\n",
" <td>...</td>\n",
" <td>-3.378909</td>\n",
" <td>-2.909732</td>\n",
" <td>-3.510667</td>\n",
" <td>6.80237</td>\n",
" <td>6.784016</td>\n",
" <td>3.514036</td>\n",
" <td>-2.931018</td>\n",
" <td>4.798139</td>\n",
" <td>2.08952</td>\n",
" <td>1.597958</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>238 rows × 231 columns</p>\n",
"</div>"
],
"text/plain": [
" index dmr_3 dmr_31a dmr_6 ebv-miR-BART13 hsa-let-7c \\\n",
"0 GSM1069774 0.732675 -0.242559 0.577801 -4.469532 1.195899 \n",
"1 GSM1069775 0.249772 -0.655514 0.104933 -5.209572 0.498366 \n",
"2 GSM1069776 0.400779 -0.597444 0.232702 -4.952808 1.081166 \n",
"3 GSM1069777 0.380263 -0.900491 0.243207 -4.892073 -0.023958 \n",
"4 GSM1069778 0.422207 -0.414831 -0.000781 -5.139127 1.077485 \n",
".. ... ... ... ... ... ... \n",
"233 GSM1070007 0.98797 -0.118186 0.750199 -4.572984 0.696251 \n",
"234 GSM1070008 -0.194781 -0.710519 -0.700226 -5.651293 0.742722 \n",
"235 GSM1070009 0.21218 -0.284657 -0.32472 -4.800142 1.0062 \n",
"236 GSM1070010 0.330997 -0.19446 -0.206405 -4.840442 1.521159 \n",
"237 GSM1070011 0.474815 0.043697 -0.102511 -4.849285 1.239637 \n",
"\n",
" hsa-let-7d-5p hsa-let-7i-5p hsa-miR-100-5p hsa-miR-101-3p ... \\\n",
"0 -0.334742 0.89199 -2.089223 -2.757097 ... \n",
"1 -0.194772 0.637863 -2.357572 -2.196884 ... \n",
"2 0.249982 1.45018 -1.138559 -1.802774 ... \n",
"3 -0.980435 1.071857 -2.077406 -2.11406 ... \n",
"4 -0.684875 0.724751 -0.689096 -1.182558 ... \n",
".. ... ... ... ... ... \n",
"233 -1.089669 0.826 -1.604393 -2.87334 ... \n",
"234 -0.964527 0.570816 -1.046029 -1.840615 ... \n",
"235 -0.141699 0.80704 -0.993146 -0.823621 ... \n",
"236 -0.424901 0.886358 -0.031455 -1.584939 ... \n",
"237 -0.704124 0.698355 -0.414715 -1.721427 ... \n",
"\n",
" hsv2-miR-H24 hsv2-miR-H25 hsv2-miR-H6 hur_1 hur_2 hur_4 \\\n",
"0 -3.956004 -3.936689 -4.099346 6.98856 7.041557 3.822267 \n",
"1 -4.334103 -4.561624 -4.719714 6.774479 6.862654 3.529789 \n",
"2 -4.550077 -4.40729 -4.621278 6.808404 6.75867 3.496675 \n",
"3 -4.018911 -4.203106 -3.938707 6.524773 6.497959 3.541502 \n",
"4 -3.690971 -4.332452 -4.178727 6.562608 6.529399 3.305132 \n",
".. ... ... ... ... ... ... \n",
"233 -2.163581 -2.15805 -2.302647 7.093144 7.150126 3.899704 \n",
"234 -4.507365 -4.23831 -4.63219 6.18658 6.232722 2.788619 \n",
"235 -2.737709 -2.644713 -3.253632 6.505956 6.548781 3.12575 \n",
"236 -3.292034 -2.941633 -3.939222 6.790132 6.829164 3.365475 \n",
"237 -3.378909 -2.909732 -3.510667 6.80237 6.784016 3.514036 \n",
"\n",
" hur_5 hur_6 miRNABrightCorner30 mr_1 \n",
"0 -2.268209 5.114399 2.017444 1.640437 \n",
"1 -2.656642 4.327117 2.022346 0.79426 \n",
"2 -2.676555 4.616284 1.498011 1.584544 \n",
"3 -3.073553 4.581648 0.789822 1.255367 \n",
"4 -2.964948 4.487481 1.219583 0.951615 \n",
".. ... ... ... ... \n",
"233 -2.954284 5.505105 2.457963 2.142301 \n",
"234 -3.103706 4.340513 0.232713 1.067806 \n",
"235 -2.917537 4.838599 0.863574 1.203499 \n",
"236 -2.736411 5.185601 0.846454 1.604729 \n",
"237 -2.931018 4.798139 2.08952 1.597958 \n",
"\n",
"[238 rows x 231 columns]"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 44,
"execution_count": 44,
...
...
%% Cell type:code id:f097ad55 tags:

``` python
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.model_selection import train_test_split
#from sklearn.model_selection import cross_val_score
#from sklearn.metrics import accuracy_score
#import sklearn.metrics as metrics
#from sklearn.metrics import auc
from sklearn.metrics import RocCurveDisplay
#from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import Lasso
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
#np.random.seed(7)
```

%% Cell type:markdown id:73b6611a tags:

# Data Preprocessing

%% Cell type:code id:0eeb7a35 tags:

``` python
df = pd.read_csv("DS/miRNA_DS_preprocessed_data.csv")
```

%% Cell type:code id:6e7836e1 tags:

``` python
df.shape
```

%% Output

(230, 239)

%% Cell type:code id:683b63ce tags:

``` python
df = df.T
```

%% Cell type:code id:2e78017d tags:

``` python
#Transform the input data
df.rename(columns=df.iloc[0], inplace=True)
df.drop(df.index[0], inplace=True)
df = df.reset_index()
```
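
The three steps above (promote the first row to column headers, drop that row, reset the index) can also be folded into the read itself. A minimal equivalent sketch, not part of the notebook, assuming the first CSV column holds the miRNA identifiers:

``` python
# Sketch: read with the miRNA names as the index, transpose so samples become rows,
# then reset_index() to recover the sample IDs as an 'index' column.
df = pd.read_csv("DS/miRNA_DS_preprocessed_data.csv", index_col=0).T.reset_index()
```
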
%% Cell type:code id:1647a959 tags:
```
python
df
```
%% Output
index dmr_3 dmr_31a dmr_6 ebv-miR-BART13 hsa-let-7c \
0 GSM1069774 0.732675 -0.242559 0.577801 -4.469532 1.195899
1 GSM1069775 0.249772 -0.655514 0.104933 -5.209572 0.498366
2 GSM1069776 0.400779 -0.597444 0.232702 -4.952808 1.081166
3 GSM1069777 0.380263 -0.900491 0.243207 -4.892073 -0.023958
4 GSM1069778 0.422207 -0.414831 -0.000781 -5.139127 1.077485
.. ... ... ... ... ... ...
233 GSM1070007 0.98797 -0.118186 0.750199 -4.572984 0.696251
234 GSM1070008 -0.194781 -0.710519 -0.700226 -5.651293 0.742722
235 GSM1070009 0.21218 -0.284657 -0.32472 -4.800142 1.0062
236 GSM1070010 0.330997 -0.19446 -0.206405 -4.840442 1.521159
237 GSM1070011 0.474815 0.043697 -0.102511 -4.849285 1.239637
hsa-let-7d-5p hsa-let-7i-5p hsa-miR-100-5p hsa-miR-101-3p ... \
0 -0.334742 0.89199 -2.089223 -2.757097 ...
1 -0.194772 0.637863 -2.357572 -2.196884 ...
2 0.249982 1.45018 -1.138559 -1.802774 ...
3 -0.980435 1.071857 -2.077406 -2.11406 ...
4 -0.684875 0.724751 -0.689096 -1.182558 ...
.. ... ... ... ... ...
233 -1.089669 0.826 -1.604393 -2.87334 ...
234 -0.964527 0.570816 -1.046029 -1.840615 ...
235 -0.141699 0.80704 -0.993146 -0.823621 ...
236 -0.424901 0.886358 -0.031455 -1.584939 ...
237 -0.704124 0.698355 -0.414715 -1.721427 ...
hsv2-miR-H24 hsv2-miR-H25 hsv2-miR-H6 hur_1 hur_2 hur_4 \
0 -3.956004 -3.936689 -4.099346 6.98856 7.041557 3.822267
1 -4.334103 -4.561624 -4.719714 6.774479 6.862654 3.529789
2 -4.550077 -4.40729 -4.621278 6.808404 6.75867 3.496675
3 -4.018911 -4.203106 -3.938707 6.524773 6.497959 3.541502
4 -3.690971 -4.332452 -4.178727 6.562608 6.529399 3.305132
.. ... ... ... ... ... ...
233 -2.163581 -2.15805 -2.302647 7.093144 7.150126 3.899704
234 -4.507365 -4.23831 -4.63219 6.18658 6.232722 2.788619
235 -2.737709 -2.644713 -3.253632 6.505956 6.548781 3.12575
236 -3.292034 -2.941633 -3.939222 6.790132 6.829164 3.365475
237 -3.378909 -2.909732 -3.510667 6.80237 6.784016 3.514036
hur_5 hur_6 miRNABrightCorner30 mr_1
0 -2.268209 5.114399 2.017444 1.640437
1 -2.656642 4.327117 2.022346 0.79426
2 -2.676555 4.616284 1.498011 1.584544
3 -3.073553 4.581648 0.789822 1.255367
4 -2.964948 4.487481 1.219583 0.951615
.. ... ... ... ...
233 -2.954284 5.505105 2.457963 2.142301
234 -3.103706 4.340513 0.232713 1.067806
235 -2.917537 4.838599 0.863574 1.203499
236 -2.736411 5.185601 0.846454 1.604729
237 -2.931018 4.798139 2.08952 1.597958
[238 rows x 231 columns]
%% Cell type:code id:4c50c510 tags:

``` python
metadata = pd.read_csv("DS/miRNA_DS_metadata_col_info.csv")
```

%% Cell type:code id:6730cf89 tags:

``` python
df = df.merge(metadata, left_on="index", right_on="Unnamed: 0")
```

%% Cell type:code id:7a8ad8ad tags:

``` python
df['title0'] = df['title0'].replace('(?i)mucosa|normal|healthy', 0, regex=True)
```

%% Cell type:code id:a8cf8643 tags:

``` python
df['title0'] = df['title0'].replace('(?i)Tumor|Cancer|carcinoma', 1, regex=True)
```

%% Cell type:code id:5c852a3f tags:

``` python
df['title0'].value_counts()
```

%% Output

title0
1    119
0    119
Name: count, dtype: int64

%% Cell type:code id:f5d203aa tags:

``` python
df = df[pd.to_numeric(df['title0'], errors='coerce').notnull()] #remove all non-numeric data from the column.
```

%% Cell type:code id:523bdaa6 tags:

``` python
df = df.drop(['index','Unnamed: 0'], axis=1)
```

%% Cell type:code id:46a6fb36 tags:

``` python
df = df.rename(columns={"title0": "index"})
```

%% Cell type:code id:e26f88c5 tags:

``` python
df['index'].value_counts()
```

%% Output

index
1    119
0    119
Name: count, dtype: int64

%% Cell type:code id:fbaf2507 tags:

``` python
df = df.apply(pd.to_numeric)
```

%% Cell type:code id:f3f7adb5 tags:

``` python
df['index'].value_counts()
```

%% Output

index
1    119
0    119
Name: count, dtype: int64

%% Cell type:code id:6a50f416 tags:

``` python
X = df.drop("index", axis=1)
y = df['index']
```

%% Cell type:code id:e644ab0e tags:

``` python
y = y.astype('int')
```

%% Cell type:markdown id:6cee6462 tags:

# Test train split

%% Cell type:code id:1da48142 tags:

``` python
# split data into training and testing data-sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=7)
```

%% Cell type:code id:129430e6 tags:

``` python
y_test.value_counts(), y_train.value_counts()
```

%% Output

(index
 0    30
 1    30
 Name: count, dtype: int64,
 index
 0    89
 1    89
 Name: count, dtype: int64)
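
The 30/30 and 89/89 counts above show that this particular random split happened to stay perfectly balanced. A stratified split would guarantee that balance rather than rely on chance; a minimal sketch, not in the original notebook, assuming the same X, y, and seed:

``` python
# Sketch: stratify=y preserves the 50/50 class ratio in both partitions by construction.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=7, stratify=y
)
```
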
%% Cell type:markdown id:1cfe2a06 tags:

# Cross validation

%% Cell type:code id:d3550b5e tags:

``` python
model = xgb.XGBClassifier(random_state=42)

# Defining parameter range
param_grid = {
    'max_depth': [3, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200],
    'gamma': [0.1, 0.01, 0.001],
    'subsample': [1.0]
}

grid = GridSearchCV(model, param_grid, refit=True, verbose=3)

# Fitting the model for grid search
grid.fit(X_train, y_train)
```
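
Once `grid.fit` completes (its verbose per-fold log follows below), the winning configuration can be read back from the fitted `GridSearchCV` object. A minimal sketch, not part of the diffed notebook, assuming the `grid` object defined above:

``` python
# Sketch: inspect the selected hyper-parameters and the mean cross-validated score
# of the refitted best estimator (available because refit=True).
print(grid.best_params_)
print(grid.best_score_)
best_xgb = grid.best_estimator_  # hypothetical name for the refitted model
```
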
%% Output

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END gamma=0.1, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1.0;, score=1.000 total time= 0.2s
[CV 2/5] END gamma=0.1, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1.0;, score=1.000 total time= 0.1s
[CV 3/5] END gamma=0.1, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1.0;, score=0.972 total time= 0.1s
[CV 4/5] END gamma=0.1, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.1s
[CV 5/5] END gamma=0.1, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.1s
[CV 1/5] END gamma=0.1, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1.0;, score=1.000 total time= 0.2s
[CV 2/5] END gamma=0.1, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1.0;, score=1.000 total time= 0.2s
[CV 3/5] END gamma=0.1, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1.0;, score=0.972 total time= 0.3s
[CV 4/5] END gamma=0.1, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.2s
[CV 5/5] END gamma=0.1, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.3s
[CV 1/5] END gamma=0.1, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1.0;, score=1.000 total time= 0.1s
[CV 2/5] END gamma=0.1, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1.0;, score=1.000 total time= 0.1s
[CV 3/5] END gamma=0.1, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1.0;, score=0.972 total time= 0.1s
[CV 4/5] END gamma=0.1, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.1s
[CV 5/5] END gamma=0.1, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.1s
[CV 1/5] END gamma=0.1, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=1.0;, score=1.000 total time= 0.2s
[CV 2/5] END gamma=0.1, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=1.0;, score=1.000 total time= 0.2s
[CV 3/5] END gamma=0.1, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=1.0;, score=0.972 total time= 0.2s
[CV 4/5] END gamma=0.1, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.2s
[CV 5/5] END gamma=0.1, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.2s
[CV 1/5] END gamma=0.1, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0;, score=0.944 total time= 0.2s
[CV 2/5] END gamma=0.1, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0;, score=0.889 total time= 0.2s
[CV 3/5] END gamma=0.1, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0;, score=0.972 total time= 0.2s
[CV 4/5] END gamma=0.1, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.2s
[CV 5/5] END gamma=0.1, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0;, score=0.943 total time= 0.2s
[CV 1/5] END gamma=0.1, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0;, score=1.000 total time= 0.4s
[CV 2/5] END gamma=0.1, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0;, score=0.917 total time= 0.3s
[CV 3/5] END gamma=0.1, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0;, score=0.972 total time= 0.3s
[CV 4/5] END gamma=0.1, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.3s
[CV 5/5] END gamma=0.1, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0;, score=0.943 total time= 0.4s
[CV 1/5] END gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0;, score=0.944 total time= 0.2s
[CV 2/5] END gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0;, score=0.889 total time= 0.2s
[CV 3/5] END gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0;, score=0.972 total time= 0.2s
[CV 4/5] END gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.2s
[CV 5/5] END gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0;, score=0.943 total time= 0.1s
[CV 1/5] END gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0;, score=1.000 total time= 0.3s
[CV 2/5] END gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0;, score=0.917 total time= 0.3s
[CV 3/5] END gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0;, score=0.972 total time= 0.3s
[CV 4/5] END gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.4s
[CV 5/5] END gamma=0.1, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0;, score=0.943 total time= 0.3s
[CV 1/5] END gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=1.0;, score=0.944 total time= 0.1s
[CV 2/5] END gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=1.0;, score=0.889 total time= 0.2s
[CV 3/5] END gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=1.0;, score=0.972 total time= 0.1s
[CV 4/5] END gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.2s
[CV 5/5] END gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=1.0;, score=0.943 total time= 0.1s
[CV 1/5] END gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=200, subsample=1.0;, score=0.944 total time= 0.4s
[CV 2/5] END gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=200, subsample=1.0;, score=0.889 total time= 0.3s
[CV 3/5] END gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=200, subsample=1.0;, score=0.972 total time= 0.4s
[CV 4/5] END gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.3s
[CV 5/5] END gamma=0.1, learning_rate=0.001, max_depth=3, n_estimators=200, subsample=1.0;, score=0.943 total time= 0.3s
[CV 1/5] END gamma=0.1, learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1.0;, score=0.944 total time= 0.1s
[CV 2/5] END gamma=0.1, learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1.0;, score=0.889 total time= 0.2s
[CV 3/5] END gamma=0.1, learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1.0;, score=0.972 total time= 0.1s
[CV 4/5] END gamma=0.1, learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.2s
[CV 5/5] END gamma=0.1, learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1.0;, score=0.943 total time= 0.2s
[CV 1/5] END gamma=0.1, learning_rate=0.001, max_depth=5, n_estimators=200, subsample=1.0;, score=0.944 total time= 0.4s
[CV 2/5] END gamma=0.1, learning_rate=0.001, max_depth=5, n_estimators=200, subsample=1.0;, score=0.889 total time= 0.4s
[CV 3/5] END gamma=0.1, learning_rate=0.001, max_depth=5, n_estimators=200, subsample=1.0;, score=0.972 total time= 0.4s
[CV 4/5] END gamma=0.1, learning_rate=0.001, max_depth=5, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.4s
[CV 5/5] END gamma=0.1, learning_rate=0.001, max_depth=5, n_estimators=200, subsample=1.0;, score=0.943 total time= 0.4s
[CV 1/5] END gamma=0.01, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1.0;, score=1.000 total time= 0.1s
[CV 2/5] END gamma=0.01, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1.0;, score=1.000 total time= 0.1s
[CV 3/5] END gamma=0.01, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1.0;, score=0.972 total time= 0.2s
[CV 4/5] END gamma=0.01, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.1s
[CV 5/5] END gamma=0.01, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.1s
[CV 1/5] END gamma=0.01, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1.0;, score=1.000 total time= 0.2s
[CV 2/5] END gamma=0.01, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1.0;, score=1.000 total time= 0.2s
[CV 3/5] END gamma=0.01, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1.0;, score=0.972 total time= 0.3s
[CV 4/5] END gamma=0.01, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.2s
[CV 5/5] END gamma=0.01, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.2s
[CV 1/5] END gamma=0.01, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1.0;, score=1.000 total time= 0.1s
[CV 2/5] END gamma=0.01, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1.0;, score=1.000 total time= 0.1s
[CV 3/5] END gamma=0.01, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1.0;, score=0.972 total time= 0.1s
[CV 4/5] END gamma=0.01, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.1s
[CV 5/5] END gamma=0.01, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.1s
[CV 1/5] END gamma=0.01, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=1.0;, score=1.000 total time= 0.2s
[CV 2/5] END gamma=0.01, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=1.0;, score=1.000 total time= 0.2s
[CV 3/5] END gamma=0.01, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=1.0;, score=0.972 total time= 0.2s
[CV 4/5] END gamma=0.01, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.2s
[CV 5/5] END gamma=0.01, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.2s
[CV 1/5] END gamma=0.01, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0;, score=0.944 total time= 0.2s
[CV 2/5] END gamma=0.01, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0;, score=0.917 total time= 0.2s
[CV 3/5] END gamma=0.01, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0;, score=0.972 total time= 0.2s
[CV 4/5] END gamma=0.01, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.2s
[CV 5/5] END gamma=0.01, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0;, score=0.943 total time= 0.2s
[CV 1/5] END gamma=0.01, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0;, score=1.000 total time= 0.3s
[CV 2/5] END gamma=0.01, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0;, score=0.917 total time= 0.3s
[CV 3/5] END gamma=0.01, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0;, score=0.972 total time= 0.4s
[CV 4/5] END gamma=0.01, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.4s
[CV 5/5] END gamma=0.01, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0;, score=0.943 total time= 0.4s
[CV 1/5] END gamma=0.01, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0;, score=0.944 total time= 0.2s
[CV 2/5] END gamma=0.01, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0;, score=0.917 total time= 0.2s
[CV 3/5] END gamma=0.01, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0;, score=0.972 total time= 0.2s
[CV 4/5] END gamma=0.01, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.2s
[CV 5/5] END gamma=0.01, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0;, score=0.943 total time= 0.2s
[CV 1/5] END gamma=0.01, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0;, score=1.000 total time= 0.3s
[CV 2/5] END gamma=0.01, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0;, score=0.917 total time= 0.3s
[CV 3/5] END gamma=0.01, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0;, score=0.972 total time= 0.4s
[CV 4/5] END gamma=0.01, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.3s
[CV 5/5] END gamma=0.01, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0;, score=0.943 total time= 0.3s
[CV 1/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=1.0;, score=0.944 total time= 0.1s
[CV 2/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=1.0;, score=0.889 total time= 0.2s
[CV 3/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=1.0;, score=0.972 total time= 0.3s
[CV 4/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.2s
[CV 5/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=1.0;, score=0.943 total time= 0.2s
[CV 1/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=200, subsample=1.0;, score=0.944 total time= 0.4s
[CV 2/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=200, subsample=1.0;, score=0.889 total time= 0.4s
[CV 3/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=200, subsample=1.0;, score=0.972 total time= 0.3s
[CV 4/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.3s
[CV 5/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=200, subsample=1.0;, score=0.943 total time= 0.3s
[CV 1/5] END gamma=0.01, learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1.0;, score=0.944 total time= 0.2s
[CV 2/5] END gamma=0.01, learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1.0;, score=0.889 total time= 0.1s
[CV 3/5] END gamma=0.01, learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1.0;, score=0.972 total time= 0.2s
[CV 4/5] END gamma=0.01, learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.2s
[CV 5/5] END gamma=0.01, learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1.0;, score=0.943 total time= 0.1s
[CV 1/5] END gamma=0.01, learning_rate=0.001, max_depth=5, n_estimators=200, subsample=1.0;, score=0.944 total time= 0.3s
[CV 2/5] END gamma=0.01, learning_rate=0.001, max_depth=5, n_estimators=200, subsample=1.0;, score=0.889 total time= 0.4s
[CV 3/5] END gamma=0.01, learning_rate=0.001, max_depth=5, n_estimators=200, subsample=1.0;, score=0.972 total time= 0.4s
[CV 4/5] END gamma=0.01, learning_rate=0.001, max_depth=5, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.3s
[CV 5/5] END gamma=0.01, learning_rate=0.001, max_depth=5, n_estimators=200, subsample=1.0;, score=0.943 total time= 0.4s
[CV 1/5] END gamma=0.001, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1.0;, score=1.000 total time= 0.1s
[CV 2/5] END gamma=0.001, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1.0;, score=1.000 total time= 0.1s
[CV 3/5] END gamma=0.001, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1.0;, score=0.972 total time= 0.2s
[CV 4/5] END gamma=0.001, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.2s
[CV 5/5] END gamma=0.001, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.2s
[CV 1/5] END gamma=0.001, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1.0;, score=1.000 total time= 0.2s
[CV 2/5] END gamma=0.001, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1.0;, score=1.000 total time= 0.2s
[CV 3/5] END gamma=0.001, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1.0;, score=0.972 total time= 0.2s
[CV 4/5] END gamma=0.001, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.2s
[CV 5/5] END gamma=0.001, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.2s
[CV 1/5] END gamma=0.001, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1.0;, score=1.000 total time= 0.1s
[CV 2/5] END gamma=0.001, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1.0;, score=1.000 total time= 0.2s
[CV 3/5] END gamma=0.001, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1.0;, score=0.972 total time= 0.1s
[CV 4/5] END gamma=0.001, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.1s
[CV 5/5] END gamma=0.001, learning_rate=0.1, max_depth=5, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.1s
[CV 1/5] END gamma=0.001, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=1.0;, score=1.000 total time= 0.2s
[CV 2/5] END gamma=0.001, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=1.0;, score=1.000 total time= 0.2s
[CV 3/5] END gamma=0.001, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=1.0;, score=0.972 total time= 0.2s
[CV 4/5] END gamma=0.001, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.2s
[CV 5/5] END gamma=0.001, learning_rate=0.1, max_depth=5, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.2s
[CV 1/5] END gamma=0.001, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0;, score=0.944 total time= 0.2s
[CV 2/5] END gamma=0.001, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0;, score=0.889 total time= 0.1s
[CV 3/5] END gamma=0.001, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0;, score=0.972 total time= 0.2s
[CV 4/5] END gamma=0.001, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.2s
[CV 5/5] END gamma=0.001, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0;, score=0.943 total time= 0.2s
[CV 1/5] END gamma=0.001, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0;, score=1.000 total time= 0.4s
[CV 2/5] END gamma=0.001, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0;, score=0.917 total time= 0.3s
[CV 3/5] END gamma=0.001, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0;, score=0.972 total time= 0.3s
[CV 4/5] END gamma=0.001, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.3s
[CV 5/5] END gamma=0.001, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=1.0;, score=0.943 total time= 0.3s
[CV 1/5] END gamma=0.001, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0;, score=0.944 total time= 0.2s
[CV 2/5] END gamma=0.001, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0;, score=0.889 total time= 0.2s
[CV 3/5] END gamma=0.001, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0;, score=0.972 total time= 0.1s
[CV 4/5] END gamma=0.001, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.2s
[CV 5/5] END gamma=0.001, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0;, score=0.943 total time= 0.2s
[CV 1/5] END gamma=0.001, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0;, score=1.000 total time= 0.4s
[CV 2/5] END gamma=0.001, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0;, score=0.917 total time= 0.4s
[CV 3/5] END gamma=0.001, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0;, score=0.972 total time= 0.4s
[CV 4/5] END gamma=0.001, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.4s
[CV 5/5] END gamma=0.001, learning_rate=0.01, max_depth=5, n_estimators=200, subsample=1.0;, score=0.943 total time= 0.4s
[CV 1/5] END gamma=0.001, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=1.0;, score=0.944 total time= 0.2s
[CV 1/5] END gamma=0.001, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=1.0;, score=0.944 total time= 0.2s
[CV 2/5] END gamma=0.001, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=1.0;, score=0.889 total time= 0.2s
[CV 2/5] END gamma=0.001, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=1.0;, score=0.889 total time= 0.2s
[CV 3/5] END gamma=0.001, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=1.0;, score=0.972 total time= 0.2s
[CV 3/5] END gamma=0.001, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=1.0;, score=0.972 total time= 0.2s
[CV 4/5] END gamma=0.001, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.2s
[CV 4/5] END gamma=0.001, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.2s
[CV 5/5] END gamma=0.001, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=1.0;, score=0.943 total time= 0.2s
[CV 5/5] END gamma=0.001, learning_rate=0.001, max_depth=3, n_estimators=100, subsample=1.0;, score=0.943 total time= 0.2s
[CV 1/5] END gamma=0.001, learning_rate=0.001, max_depth=3, n_estimators=200, subsample=1.0;, score=0.944 total time= 0.3s
[CV 1/5] END gamma=0.001, learning_rate=0.001, max_depth=3, n_estimators=200, subsample=1.0;, score=0.944 total time= 0.3s
[CV 2/5] END gamma=0.001, learning_rate=0.001, max_depth=3, n_estimators=200, subsample=1.0;, score=0.889 total time= 0.3s
[CV 2/5] END gamma=0.001, learning_rate=0.001, max_depth=3, n_estimators=200, subsample=1.0;, score=0.889 total time= 0.3s
[CV 3/5] END gamma=0.001, learning_rate=0.001, max_depth=3, n_estimators=200, subsample=1.0;, score=0.972 total time= 0.4s
[CV 3/5] END gamma=0.001, learning_rate=0.001, max_depth=3, n_estimators=200, subsample=1.0;, score=0.972 total time= 0.4s
[CV 4/5] END gamma=0.001, learning_rate=0.001, max_depth=3, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.4s
[CV 4/5] END gamma=0.001, learning_rate=0.001, max_depth=3, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.4s
[CV 5/5] END gamma=0.001, learning_rate=0.001, max_depth=3, n_estimators=200, subsample=1.0;, score=0.943 total time= 0.3s
[CV 5/5] END gamma=0.001, learning_rate=0.001, max_depth=3, n_estimators=200, subsample=1.0;, score=0.943 total time= 0.3s
[CV 1/5] END gamma=0.001, learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1.0;, score=0.944 total time= 0.2s
[CV 1/5] END gamma=0.001, learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1.0;, score=0.944 total time= 0.2s
[CV 2/5] END gamma=0.001, learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1.0;, score=0.889 total time= 0.2s
[CV 2/5] END gamma=0.001, learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1.0;, score=0.889 total time= 0.2s
[CV 3/5] END gamma=0.001, learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1.0;, score=0.972 total time= 0.2s
[CV 3/5] END gamma=0.001, learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1.0;, score=0.972 total time= 0.2s
[CV 4/5] END gamma=0.001, learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.2s
[CV 4/5] END gamma=0.001, learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1.0;, score=0.971 total time= 0.2s
[CV 5/5] END gamma=0.001, learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1.0;, score=0.943 total time= 0.2s
[CV 5/5] END gamma=0.001, learning_rate=0.001, max_depth=5, n_estimators=100, subsample=1.0;, score=0.943 total time= 0.2s
[CV 1/5] END gamma=0.001, learning_rate=0.001, max_depth=5, n_estimators=200, subsample=1.0;, score=0.944 total time= 0.4s
[CV 1/5] END gamma=0.001, learning_rate=0.001, max_depth=5, n_estimators=200, subsample=1.0;, score=0.944 total time= 0.4s
[CV 2/5] END gamma=0.001, learning_rate=0.001, max_depth=5, n_estimators=200, subsample=1.0;, score=0.889 total time= 0.3s
[CV 2/5] END gamma=0.001, learning_rate=0.001, max_depth=5, n_estimators=200, subsample=1.0;, score=0.889 total time= 0.3s
[CV 3/5] END gamma=0.001, learning_rate=0.001, max_depth=5, n_estimators=200, subsample=1.0;, score=0.972 total time= 0.3s
[CV 3/5] END gamma=0.001, learning_rate=0.001, max_depth=5, n_estimators=200, subsample=1.0;, score=0.972 total time= 0.3s
[CV 4/5] END gamma=0.001, learning_rate=0.001, max_depth=5, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.4s
[CV 4/5] END gamma=0.001, learning_rate=0.001, max_depth=5, n_estimators=200, subsample=1.0;, score=0.971 total time= 0.4s
[CV 5/5] END gamma=0.001, learning_rate=0.001, max_depth=5, n_estimators=200, subsample=1.0;, score=0.943 total time= 0.4s
[CV 5/5] END gamma=0.001, learning_rate=0.001, max_depth=5, n_estimators=200, subsample=1.0;, score=0.943 total time= 0.4s
GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_b...
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, predictor=None,
                                     random_state=42, ...),
             param_grid={'gamma': [0.1, 0.01, 0.001],
                         'learning_rate': [0.1, 0.01, 0.001],
                         'max_depth': [3, 5], 'n_estimators': [100, 200],
                         'subsample': [1.0]},
             verbose=3)
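Rather than reading the fold-by-fold log above, the whole search can be summarised in one table from `grid.cv_results_`. This is a minimal sketch that is not part of the original notebook; it assumes the fitted `grid` object from above and that pandas is available as `pd`, as elsewhere in this notebook.

``` python
# Summarise the grid search instead of scanning the verbose log.
cv_results = pd.DataFrame(grid.cv_results_)   # assumes `grid` has already been fitted
summary = cv_results[["params", "mean_test_score", "std_test_score", "rank_test_score"]]
print(summary.sort_values("rank_test_score").head(10))
```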
%% Cell type:code id:556e249c tags:

``` python
# print best parameter after tuning
print(grid.best_params_)

# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)
```
%% Output

{'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.1, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=3, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)
%% Cell type:code id:0686e808 tags:

``` python
model_xgb = grid.best_estimator_
model_xgb.fit(X_train, y_train)
```
%% Output

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.1, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=3, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)
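A side note on the cell above: with the default `refit=True`, `GridSearchCV` has already refitted `best_estimator_` on the full training set, so the explicit `fit` call retrains an identical model. A minimal sketch of the equivalent shortcut (stated as an assumption, since the construction of the search is not shown in this part of the notebook):

``` python
# grid.predict / grid.predict_proba delegate to the refitted best_estimator_.
print(grid.refit)                      # True unless it was overridden when the search was built
y_pred_direct = grid.predict(X_test)   # equivalent to model_xgb.predict(X_test)
```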
%% Cell type:code id:ac776bef tags:

``` python
y_proba = model_xgb.fit(X_train, y_train).predict_proba(X_test)
```
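`predict_proba` returns one column per class, ordered as in `model_xgb.classes_`. If a non-default decision threshold were ever wanted, it could be applied to the positive-class column; a minimal sketch, assuming class 1 is treated as the positive class:

``` python
# A cut-off of 0.5 on the class-1 column reproduces the default predict() labels.
threshold = 0.5
y_pred_custom = (y_proba[:, 1] >= threshold).astype(int)
```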
%% Cell type:markdown id:3ea57532 tags:

# Classification report
%% Cell type:code id:18becbe2 tags:

``` python
from sklearn.metrics import classification_report, confusion_matrix

grid_predictions = grid.predict(X_test)

print(classification_report(y_test, grid_predictions))
```
%% Output

              precision    recall  f1-score   support

           0       0.97      0.97      0.97        30
           1       0.97      0.97      0.97        30

    accuracy                           0.97        60
   macro avg       0.97      0.97      0.97        60
weighted avg       0.97      0.97      0.97        60
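The same aggregate numbers can also be pulled out programmatically, which is convenient when comparing several models; a minimal sketch using `sklearn.metrics` (not part of the original notebook):

``` python
from sklearn.metrics import accuracy_score, f1_score

print("Accuracy  :", accuracy_score(y_test, grid_predictions))
print("F1 (macro):", f1_score(y_test, grid_predictions, average="macro"))
```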
%% Cell type:code id:c0193b78 tags:

``` python
classes = model_xgb.classes_
```
%% Cell type:code id:d723c69f tags:

``` python
classes
```
%% Output

array([0, 1])
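Inspecting `classes_` shows which column of `y_proba` belongs to which label; a minimal sketch pairing the two for the first test sample:

``` python
# Column order of predict_proba follows model_xgb.classes_ (here array([0, 1])).
first_sample_probs = dict(zip(classes, y_proba[0]))
print(first_sample_probs)
```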
%% Cell type:code id:4643393d tags:

``` python
####### CONFUSION MATRIX ###########
from sklearn import metrics

y_test_pred_xgb = model_xgb.predict(X_test)

confusion_matrix_test = metrics.confusion_matrix(y_test, y_test_pred_xgb)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=confusion_matrix_test)

cm_display.plot()
plt.show()
```
%% Output

%% Cell type:code id:5ad4efb1 tags:
``` python
total1 = sum(sum(confusion_matrix_test))

##### from confusion matrix calculate accuracy
# note: these formulas treat class 0 as the "positive" class
accuracy1 = (confusion_matrix_test[0, 0] + confusion_matrix_test[1, 1]) / total1
print('Accuracy : ', accuracy1)

sensitivity1 = confusion_matrix_test[0, 0] / (confusion_matrix_test[0, 0] + confusion_matrix_test[0, 1])
print('Sensitivity : ', sensitivity1)

specificity1 = confusion_matrix_test[1, 1] / (confusion_matrix_test[1, 0] + confusion_matrix_test[1, 1])
print('Specificity : ', specificity1)
```
%% Output

Accuracy : 0.9666666666666667
Sensitivity : 0.9666666666666667
Specificity : 0.9666666666666667
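As a cross-check, the same quantities can be computed with `recall_score`. Because the manual formulas above treat class 0 as the positive class, the printed sensitivity is the recall of class 0 and the printed specificity is the recall of class 1. A minimal sketch (not part of the original notebook):

``` python
from sklearn.metrics import recall_score

print("Recall of class 0 (sensitivity above):", recall_score(y_test, y_test_pred_xgb, pos_label=0))
print("Recall of class 1 (specificity above):", recall_score(y_test, y_test_pred_xgb, pos_label=1))
```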
%% Cell type:markdown id:6603d82c tags:

# ROC curve
%% Cell type:code id:0e2a2694 tags:

``` python
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import auc


def roc(X_train, y_train, model, label):
    cv = StratifiedKFold(n_splits=6)
    classifier = model
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    fig, ax = plt.subplots(figsize=(6, 6))
    for fold, (train, test) in enumerate(cv.split(X_train, y_train)):
        classifier.fit(X_train.iloc[train], y_train.iloc[train])
        viz = RocCurveDisplay.from_estimator(
            classifier,
            X_train.iloc[test],
            y_train.iloc[test],
            name=f"ROC fold {fold}",
            alpha=0.3,
            lw=1,
            ax=ax,
        )
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)
    ax.plot([0, 1], [0, 1], "k--", label="chance level (AUC = 0.5)")

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(
        mean_fpr,
        mean_tpr,
        color="b",
        label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
        lw=2,
        alpha=0.8,
    )

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(
        mean_fpr,
        tprs_lower,
        tprs_upper,
        color="grey",
        alpha=0.2,
        label=r"$\pm$ 1 std. dev.",
    )

    ax.set(
        xlim=[-0.05, 1.05],
        ylim=[-0.05, 1.05],
        xlabel="False Positive Rate",
        ylabel="True Positive Rate",
        title=label,
    )
    ax.axis("square")
    ax.legend(loc="lower right")
    plt.show()
```
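A quicker numeric cross-check of the plotted mean AUC is `cross_val_score` with the `roc_auc` scorer over the same splitter (the values will differ slightly from the interpolated curve); a minimal sketch, not part of the original notebook:

``` python
from sklearn.model_selection import cross_val_score

cv_auc = cross_val_score(model_xgb, X_train, y_train,
                         cv=StratifiedKFold(n_splits=6), scoring="roc_auc")
print("CV AUC: %0.2f +/- %0.2f" % (cv_auc.mean(), cv_auc.std()))
```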
%% Cell type:code id:d4cc8e6d tags:

``` python
model = model_xgb
label = "ROC curve of training data"
roc(X_train, y_train, model, label)
```
%% Output
%% Cell type:code id:1199e2e4 tags:

``` python
label = "ROC curve of testing data"
roc(X_test, y_test, model, label)
```
%% Output
%% Cell type:markdown id:bee03388 tags:

# Feature importance
%% Cell type:code id:6688e037 tags:

``` python
# for important features:
important_feat = model_xgb.feature_importances_

# get indices of those important features
idx = important_feat.argsort(kind="quicksort")
idx = idx[::-1][:50]
```
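To see which miRNAs drive the model, the selected importances can also be plotted; a minimal sketch that is not part of the original notebook, assuming `X` is the feature DataFrame (so `X.columns` holds the miRNA names) and matplotlib is available as `plt`:

``` python
# Bar chart of the top features selected above (idx holds their column positions).
top_n = 20
plt.figure(figsize=(8, 6))
plt.barh(X.columns[idx[:top_n]][::-1], important_feat[idx[:top_n]][::-1])
plt.xlabel("XGBoost feature importance")
plt.title("Top %d miRNA features" % top_n)
plt.tight_layout()
plt.show()
```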
%% Cell type:code id:4e6a7ea1 tags:

``` python
idx
```
%% Output

array([ 66, 65, 84, 94, 140, 32, 169, 137, 23, 212, 10, 166, 13,
36, 56, 126, 48, 57, 42, 208, 37, 113, 29, 160, 22, 96,
162, 229, 189, 101, 104, 127, 135, 21, 79, 78, 77, 76, 75,
74, 73, 72, 202, 71, 69, 68, 67, 64, 63, 62])
%% Cell type:code id:f2101fe1 tags:

``` python
df1 = X.T
```
%% Cell type:code id:2cbf1166 tags:

``` python
top_met = df1.iloc[idx]
```
%% Cell type:code id:2370b2df tags:

``` python
top_met.index
```
%% Output

Index(['hsa-miR-18b-5p', 'hsa-miR-18a-5p', 'hsa-miR-21-5p', 'hsa-miR-25-3p',
       'hsa-miR-424-5p', 'hsa-miR-130b-3p', 'hsa-miR-455-3p', 'hsa-miR-378i',
       'hsa-miR-1268a', 'hsa-miR-93-5p', 'hsa-miR-106b-5p', 'hsa-miR-451a',
       'hsa-miR-10b-5p', 'hsa-miR-140-3p', 'hsa-miR-15b-5p', 'hsa-miR-3651',
       'hsa-miR-150-5p', 'hsa-miR-16-2-3p', 'hsa-miR-145-5p', 'hsa-miR-7-5p',
       'hsa-miR-140-5p', 'hsa-miR-3198', 'hsa-miR-1290', 'hsa-miR-4465',
       'hsa-miR-126-3p', 'hsa-miR-26b-5p', 'hsa-miR-4497', 'mr_1',
       'hsa-miR-497-5p', 'hsa-miR-29c-3p', 'hsa-miR-30a-5p', 'hsa-miR-3656',
       'hsa-miR-378a-3p', 'hsa-miR-125b-5p', 'hsa-miR-200c-3p',
       'hsa-miR-200b-3p', 'hsa-miR-19b-3p', 'hsa-miR-19a-3p',
       'hsa-miR-199a-5p', 'hsa-miR-199a-3p', 'hsa-miR-1973', 'hsa-miR-197-5p',
       'hsa-miR-642a-3p', 'hsa-miR-197-3p', 'hsa-miR-193b-3p',
       'hsa-miR-193a-5p', 'hsa-miR-1915-3p', 'hsa-miR-188-5p',
       'hsa-miR-185-5p', 'hsa-miR-181b-5p'],
      dtype='object')
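The names above can also be paired with their importance scores in a single table for reporting; a minimal sketch, assuming pandas is available as `pd`:

``` python
importance_table = pd.DataFrame({
    "feature": top_met.index,            # top-50 miRNA names selected via idx
    "importance": important_feat[idx],   # matching XGBoost importances
}).sort_values("importance", ascending=False)
print(importance_table.head(10))
```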
...