%% Cell type:markdown id: tags:
 
To begin: Click anywhere in this cell and press Run on the menu bar. This executes the current cell and then highlights the next cell. There are two types of cells: text cells and code cells. When you Run a text cell (we are in a text cell now), you advance to the next cell without executing any code. When you Run a code cell (identified by In[ ]: to the left of the cell), you advance to the next cell after executing all the Python code within that cell. Any visual results produced by the code (text/figures) are reported directly below that cell. Press Run again, and repeat this process until the end of the notebook. NOTE: All the cells in this notebook can be executed sequentially by clicking Kernel→Restart & Run All. Should anything crash, restart the Jupyter kernel by clicking Kernel→Restart and start again from the top.
 
%% Cell type:markdown id: tags:
 
# **PCLR_MTBLS92**
 
 
 
The study used in this tutorial was previously published by Hilvo et al. (2013), and the deconvolved and annotated data file was deposited at the MetaboLights data repository. The data can be accessed directly via its study ID: MTBLS92. This workflow requires data to be formatted as a Microsoft Excel file using the Tidy Data framework (i.e. each column is a variable, and each row is an observation). As such, the Excel file contains a Data Sheet and a Peak Sheet. The Data Sheet contains all the metabolite concentrations and the metadata associated with each observation (requiring the inclusion of the columns: Idx, SampleID, and Class). The Peak Sheet contains all the metadata pertaining to each measured metabolite (requiring the inclusion of the columns: Idx, Name, and Label). Please inspect the Excel file MTBLS92.xlsx used in this workflow before proceeding.
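A quick way to carry out that inspection is to open the two sheets with pandas (a minimal sketch; the sheet names 'Data' and 'Peak' match those passed to cb.utils.load_dataXL in step 2 below):

```
import pandas as pd

# Peek at the first rows of each sheet of the Tidy Data formatted file
data_sheet = pd.read_excel('MTBLS92.xlsx', sheet_name='Data')
peak_sheet = pd.read_excel('MTBLS92.xlsx', sheet_name='Peak')

print(data_sheet[['Idx', 'SampleID', 'Class']].head())  # observation metadata
print(peak_sheet[['Idx', 'Name', 'Label']].head())      # metabolite metadata
```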
 
This is a plasma LC-MS dataset consisting of 138 named metabolites. The primary outcome for this paper was the change in metabolite profiles before and after neoadjuvant chemotherapy in breast cancer patients. For the purpose of this study, we compare samples taken before (Class=1; n=142) and after (Class=0; n=111) neoadjuvant chemotherapy in a binary discriminant analysis.
 
%% Cell type:markdown id: tags:
 
# **PCLR Workflow**
 
This Jupyter Notebook implements the complete workflow for creating, optimising, and evaluating a principal component logistic regression (PCLR) model. PCLR is a two-stage algorithm combining principal component analysis (PCA) and logistic regression, where the first N principal component scores act as the independent variables of the logistic regression, and the binary classification is the dependent variable. The value of N is chosen by the user. PCLR was implemented using PCA and LogisticRegression from scikit-learn.
 
Please refer to the 'cimcb' package documentation for further details regarding this specific implementation: https://cimcb.github.io/cimcb
 
PCLR uses the following hyperparameter(s):

* n_components: the number of principal components projected into the logistic regression (default = 2)

The purpose of this hyperparameter is explained in Aguilera and Escabias (2000).
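Conceptually, the two stages can be sketched with scikit-learn alone (a minimal illustration of the idea, not the cb.model.PCLR implementation, which adds its own training, testing, and plotting conveniences):

```
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Stage 1: project X onto the first n_components principal component scores.
# Stage 2: use those scores as the independent variables of a logistic regression.
pclr = make_pipeline(PCA(n_components=2), LogisticRegression())

rng = np.random.default_rng(0)
X = rng.normal(size=(20, 5))        # toy data: 20 observations, 5 variables
y = rng.integers(0, 2, size=20)     # toy binary outcome vector
pclr.fit(X, y)
print(pclr.predict_proba(X)[:3])    # class probabilities for the first 3 rows
```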
 
The notebook workflow is broken into the following steps:
 
 
1. **Import Packages**: First, the Python packages required for this workflow need to be imported (numpy, pandas, and cimcb).
2. **Load Data & Peak Sheet**: From the Excel file, import the Data and Peak sheets and create two corresponding pandas DataFrames: DataTable and PeakTable.
3. **Extract X & Y**: Next, we reduce the data in DataTable to include only those observations needed for the binary comparison and create a new table: DataTable2. We define one column of the data table to be the "outcome" variable Outcomes, and convert the class labels in this column to a binary outcome vector Y, where 1 is the positive outcome and 0 the negative outcome (e.g. case=1 & control=0). A new variable PeakList is created to hold the names (M1...Mn) of the metabolites to be used in the discriminant analysis. To create an independent dataset for evaluation, scikit-learn's train_test_split() function is used. The data is split into 2/3 training (DataTrain and YTrain) and 1/3 test (DataTest and YTest). The metabolite data corresponding to PeakList is extracted from DataTrain and placed in a matrix XTrain. The XTrain matrix is log-transformed and auto-scaled, with missing values imputed using k-nearest neighbours (k=3). The metabolite data corresponding to PeakList is then extracted from DataTest and placed in a matrix XTest. The XTest matrix is log-transformed and auto-scaled (using the mu and sigma from XTrain; see the scaling sketch after this list), with missing values imputed using k-nearest neighbours (k=3).
4. **Hyperparameter Optimisation**: Here, we use the helper function cb.cross_val.KFold() to carry out 5-fold cross-validation of a set of PCLR models configured with different numbers of principal components (1 to 80). This helper function is generally applicable, and the values being passed to it are:
* The class of model to be created by the function, cb.model.PCLR.
* The metabolite matrix, XTrainKnn, and the binary outcome vector, YTrain.
* A dictionary, param_dict, describing key:value pairs where the key is a parameter that is passed to the model, and the value is a list of values to be passed to that parameter.
* The number of folds in the cross-validation, folds, and the number of Monte Carlo repetitions of the k-fold CV, n_mc.
 
When cv.run() followed by cv.plot(metric='r2q2') are run, the results are displayed as two plots of the $R^2$ and $Q^2$ statistics: (a) the difference ($R^2 - Q^2$) vs. $Q^2$, and (b) the absolute values of both $R^2$ and $Q^2$ against the number of components. Here $R^2$ measures goodness of fit on the full training data, while $Q^2$ measures it on the cross-validated predictions (a sketch of the shared formula follows the workflow list below). If the function cv.plot(metric='auc') is run, the predictive ability of the model is instead presented as measures of the area under the ROC curve, $AUC_{full}$ & $AUC_{cv}$, as a nonparametric alternative to $R^2$ & $Q^2$. These plots are used to aid in selecting the optimal number of components.
 
5. **Build Model & Evaluate**: Here, we use the class cb.model.PCLR() to build a PCLR model using the optimal hyperparameter values determined in step 4. The model is trained on the training dataset, XTrainKnn, and tested on the independent test dataset, XTestKnn. Next, the trained model's .evaluate() method is used to visualise model performance for both the training and independent test dataset using: a violin plot showing the distributions of negative and positive responses as violin and box-whisker plots; a probability density function plot for each response type; and a ROC curve that displays the curve for the training dataset (green) and test dataset (yellow).
6. **Bootstrap Evaluation**: Finally, to estimate the robustness and generalised predictive ability of this model, we perform bootstrap aggregation (bagging) using the helper function cb.bootstrap.Per() with 100 bootstrapped models. This generates a population of 100 model predictions for both the training set (in-bag prediction - IB) and the holdout test set (out-of-bag - OOB) from the full dataset, with the metabolite matrix, XBootKnn, and binary outcome vector, Y. These predictions are visualised with a box-violin and probability density function plot for the aggregate model. The ROC curve displays the curve for the training dataset (green) and test dataset (yellow) from section 5 with 95% confidence intervals (light green band = IB & light yellow band = OOB).
7. **Export Results**: Exporting the model evaluation results as an Excel spreadsheet.
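Step 3's auto-scaling discipline — deriving mu and sigma from the training matrix only and reusing them on the test matrix, so that no information leaks from the test set into preprocessing — can be sketched in plain numpy (an illustration of what cb.utils.scale presumably does; the exact cimcb conventions, such as the use of the sample standard deviation, are assumptions):

```
import numpy as np

def auto_scale(X, mu=None, sigma=None):
    """Centre each column to zero mean and scale it to unit variance."""
    X = np.asarray(X, dtype=float)
    if mu is None:
        mu = np.nanmean(X, axis=0)
        sigma = np.nanstd(X, axis=0, ddof=1)  # sample std (assumed convention)
    return (X - mu) / sigma, mu, sigma

# Fit the scaling on the training data, then apply the same mu and sigma
# unchanged to the test data (XTrainLog and XTestLog are the log-transformed
# matrices created in step 3).
XTrainScaled, mu, sigma = auto_scale(XTrainLog)
XTestScaled, _, _ = auto_scale(XTestLog, mu=mu, sigma=sigma)
```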
 
 
 
 
 
 
 
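For reference, $R^2$ and $Q^2$ share the same formula, $1 - \sum_i (y_i - \hat{y}_i)^2 / \sum_i (y_i - \bar{y})^2$; the only difference is whether the predictions $\hat{y}_i$ come from the full training fit ($R^2$) or from cross-validated, held-out folds ($Q^2$). A minimal sketch (cimcb's exact bookkeeping may differ):

```
import numpy as np

def r2q2_statistic(y, y_pred):
    """Return 1 - SS_res / SS_tot. This is R2 when y_pred comes from the
    full model fit, and Q2 when y_pred comes from cross-validated
    (held-out) predictions."""
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((y - y_pred) ** 2)
    ss_tot = np.sum((y - np.mean(y)) ** 2)
    return 1 - ss_res / ss_tot
```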
 
%% Cell type:markdown id: tags:
 
# **1. Import Packages**
 
%% Cell type:code id: tags:
 
```
!pip install https://github.com/KevinMMendez/cimcb/archive/master.zip
```
 
%% Output
 
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting https://github.com/KevinMMendez/cimcb/archive/master.zip
Downloading https://github.com/KevinMMendez/cimcb/archive/master.zip
 \ 486.2 kB 8.4 MB/s 0:00:00
Preparing metadata (setup.py) ... done
Requirement already satisfied: bokeh>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from cimcb==2.1.0) (2.4.3)
Requirement already satisfied: keras>=2.2.4 in /usr/local/lib/python3.10/dist-packages (from cimcb==2.1.0) (2.12.0)
Requirement already satisfied: numpy>=1.12 in /usr/local/lib/python3.10/dist-packages (from cimcb==2.1.0) (1.22.4)
Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from cimcb==2.1.0) (1.5.3)
Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from cimcb==2.1.0) (1.10.1)
Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from cimcb==2.1.0) (1.2.2)
Requirement already satisfied: statsmodels in /usr/local/lib/python3.10/dist-packages (from cimcb==2.1.0) (0.13.5)
Collecting theano
Downloading Theano-1.0.5.tar.gz (2.8 MB)
 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.8/2.8 MB 34.1 MB/s eta 0:00:00
Preparing metadata (setup.py) ... done
Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from cimcb==2.1.0) (4.65.0)
Requirement already satisfied: xlrd in /usr/local/lib/python3.10/dist-packages (from cimcb==2.1.0) (2.0.1)
Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from cimcb==2.1.0) (1.2.0)
Requirement already satisfied: tornado>=5.1 in /usr/local/lib/python3.10/dist-packages (from bokeh>=1.0.0->cimcb==2.1.0) (6.2)
Requirement already satisfied: packaging>=16.8 in /usr/local/lib/python3.10/dist-packages (from bokeh>=1.0.0->cimcb==2.1.0) (23.1)
Requirement already satisfied: Jinja2>=2.9 in /usr/local/lib/python3.10/dist-packages (from bokeh>=1.0.0->cimcb==2.1.0) (3.1.2)
Requirement already satisfied: pillow>=7.1.0 in /usr/local/lib/python3.10/dist-packages (from bokeh>=1.0.0->cimcb==2.1.0) (8.4.0)
Requirement already satisfied: typing-extensions>=3.10.0 in /usr/local/lib/python3.10/dist-packages (from bokeh>=1.0.0->cimcb==2.1.0) (4.5.0)
Requirement already satisfied: PyYAML>=3.10 in /usr/local/lib/python3.10/dist-packages (from bokeh>=1.0.0->cimcb==2.1.0) (6.0)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->cimcb==2.1.0) (2022.7.1)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->cimcb==2.1.0) (2.8.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->cimcb==2.1.0) (3.1.0)
Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.10/dist-packages (from statsmodels->cimcb==2.1.0) (0.5.3)
Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from theano->cimcb==2.1.0) (1.16.0)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from Jinja2>=2.9->bokeh>=1.0.0->cimcb==2.1.0) (2.1.2)
Building wheels for collected packages: cimcb, theano
Building wheel for cimcb (setup.py) ... done
Created wheel for cimcb: filename=cimcb-2.1.0-py3-none-any.whl size=166785 sha256=bccbd41551001eb878ba584f333c1d05b467e2ea145be2b8b02622052edd396e
Stored in directory: /tmp/pip-ephem-wheel-cache-am7fem44/wheels/7b/86/72/e33231802da24f264f585c8e6cc4c045a6fc4a903b09cec5c2
Building wheel for theano (setup.py) ... done
Created wheel for theano: filename=Theano-1.0.5-py3-none-any.whl size=2668109 sha256=c8097f2c431f446d237d345296056404f9be2e254e7485c6f35007ebe6242b6d
Stored in directory: /root/.cache/pip/wheels/d9/e6/7d/2267d21a99e4ab8276f976f293b4ff23f50c9d809f4a216ebb
Successfully built cimcb theano
Installing collected packages: theano, cimcb
Successfully installed cimcb-2.1.0 theano-1.0.5
 
%% Cell type:code id: tags:
 
```
import numpy as np
import pandas as pd
import cimcb as cb
from sklearn.model_selection import train_test_split
 
print('All packages successfully loaded')
```
 
%% Output
 
All packages successfully loaded
 
%% Cell type:markdown id: tags:
 
# **2. Load Data & Peak Sheet**
 
%% Cell type:code id: tags:
 
```
file = 'MTBLS92.xlsx'
 
DataTable, PeakTable = cb.utils.load_dataXL(file, DataSheet='Data', PeakSheet='Peak')
```
 
%% Output
 
Loadings PeakFile: Peak
Loadings DataFile: Data
Data Table & Peak Table is suitable.
TOTAL SAMPLES: 447 TOTAL PEAKS: 138
Done!
 
%% Cell type:markdown id: tags:
 
# **3. Extract X & Y**
 
%% Cell type:code id: tags:
 
```
# Extract PeakList
PeakList = PeakTable['Name']
 
# Select Subset of Data
DataTable2 = DataTable[(DataTable.Class == 1) | (DataTable.Class == 0)]
 
# Create a Binary Y Vector
Outcomes = DataTable2['Class']
Y = Outcomes.values
 
# Split Data into Train (2/3) and Test (1/3)
DataTrain, DataTest, YTrain, YTest = train_test_split(DataTable2, Y, test_size=1/3, stratify=Y, random_state=11)
 
# Extract Train Data
XTrain = DataTrain[PeakList]
XTrainLog = np.log(XTrain)
print(XTrainLog.shape)
XTrainScale, mu, sigma = cb.utils.scale(XTrainLog, method='auto', return_mu_sigma=True)
 
XTrainKnn = cb.utils.knnimpute(XTrainScale, k=3)
 
# Extract Test Data
XTest = DataTest[PeakList]
XTestLog = np.log(XTest)
XTestScale = cb.utils.scale(XTestLog, method='auto', mu=mu, sigma=sigma)
XTestKnn = cb.utils.knnimpute(XTestScale, k=3)
```
 
%% Output
 
(168, 138)
 
%% Cell type:markdown id: tags:
 
# **4. Hyperparameter Optimisation**
 
%% Cell type:code id: tags:
 
```
# Parameter Dictionary
param_dict = {'n_components': [1,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80]}
 
# Initialise
cv = cb.cross_val.KFold(model=cb.model.PCLR,
X=XTrainKnn,
Y=YTrain,
param_dict=param_dict,
folds=5,
n_mc=10)
 
# Run and Plot
cv.run()
cv.plot(metric='auc')
cv.plot(metric='r2q2')
```
 
%% Output
 
Number of cores set to: 2
Running ...
 
1/2: 100%|██████████| 41/41 [00:13<00:00, 3.04it/s]
2/2: 100%|██████████| 410/410 [00:30<00:00, 13.50it/s]
 
Time taken: 0.74 minutes with 2 cores
Done!
 
/usr/local/lib/python3.10/dist-packages/cimcb/cross_val/BaseCrossVal.py:559: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.
for key, values in full_std.iteritems():
/usr/local/lib/python3.10/dist-packages/cimcb/cross_val/BaseCrossVal.py:564: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.
for key, values in cv_std.iteritems():
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
 
 
 
 
/usr/local/lib/python3.10/dist-packages/cimcb/cross_val/BaseCrossVal.py:559: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.
for key, values in full_std.iteritems():
/usr/local/lib/python3.10/dist-packages/cimcb/cross_val/BaseCrossVal.py:564: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.
for key, values in cv_std.iteritems():
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
 
 
 
%% Cell type:markdown id: tags:
 
# **5. Build Model & Evaluate**
 
%% Cell type:code id: tags:
 
```
# Build Model
model = cb.model.PCLR(n_components=48)
YPredTrain = model.train(XTrainKnn, YTrain)
YPredTest = model.test(XTestKnn)
 
# Put YTrain and YPredTrain in a List
EvalTrain = [YTrain, YPredTrain]
 
# Put YTest and YPredTest in a List
EvalTest = [YTest, YPredTest]
 
# Evaluate Model (include Test Dataset)
model.evaluate(testset=EvalTest)
```
 
%% Output
 
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
/usr/local/lib/python3.10/dist-packages/cimcb/model/BaseModel.py:367: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self.table[i][0] = np.round(self.table[i][0], 2)
/usr/local/lib/python3.10/dist-packages/cimcb/model/BaseModel.py:368: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self.table[i][1] = np.round(self.table[i][1], 2)
/usr/local/lib/python3.10/dist-packages/cimcb/model/BaseModel.py:372: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self.table[i][2] = "%0.2e" % self.table[i][2]
BokehDeprecationWarning: 'WidgetBox' is deprecated and will be removed in Bokeh 3.0, use 'bokeh.models.Column' instead
 
 
 
 
%% Cell type:markdown id: tags:
 
# **6. Bootstrap Evaluation**
 
%% Cell type:code id: tags:
 
```
# Extract X Data
XBoot = DataTable2[PeakList]
XBootLog = np.log(XBoot)
XBootScale = cb.utils.scale(XBootLog, method='auto')
XBootKnn = cb.utils.knnimpute(XBootScale, k=3)
YPredBoot = model.train(XBootKnn, Y)
 
# Build Bootstrap Models
bootmodel = cb.bootstrap.Per(model, bootnum=100)
bootmodel.run()
 
# Bootstrap Evaluate Model (include Test Dataset)
bootmodel.evaluate(trainset=EvalTrain, testset=EvalTest)
```
 
%% Output
 
Number of cores set to: 2
 
100%|██████████| 100/100 [00:02<00:00, 45.33it/s]
 
Time taken: 0.04 minutes with 2 cores
 
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
/usr/local/lib/python3.10/dist-packages/cimcb/bootstrap/BaseBootstrap.py:343: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self.table[i][0] = np.round(self.table[i][0], 2)
/usr/local/lib/python3.10/dist-packages/cimcb/bootstrap/BaseBootstrap.py:344: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self.table[i][1] = np.round(self.table[i][1], 2)
/usr/local/lib/python3.10/dist-packages/cimcb/bootstrap/BaseBootstrap.py:348: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
self.table[i][2] = "%0.2e" % self.table[i][2]
BokehDeprecationWarning: 'WidgetBox' is deprecated and will be removed in Bokeh 3.0, use 'bokeh.models.Column' instead
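%% Cell type:markdown id: tags:

The 95% confidence bands above come from the population of bootstrap predictions. The percentile principle behind cb.bootstrap.Per() can be sketched as follows (a simplified illustration with toy data, not the cimcb internals):

%% Cell type:code id: tags:

```
import numpy as np

def percentile_ci(boot_preds, alpha=0.05):
    """95% percentile bootstrap CI per sample, from an array of shape
    (n_bootstrap_models, n_samples) of predicted probabilities."""
    lower = np.percentile(boot_preds, 100 * alpha / 2, axis=0)
    upper = np.percentile(boot_preds, 100 * (1 - alpha / 2), axis=0)
    return lower, upper

# Example with a toy population of 100 bootstrap predictions for 5 samples
rng = np.random.default_rng(1)
toy = rng.uniform(size=(100, 5))
lo, hi = percentile_ci(toy)
print(np.round(lo, 2), np.round(hi, 2))
```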
 
 
 
 
%% Cell type:markdown id: tags:
 
# **7. Export Results**
 
%% Cell type:code id: tags:
 
```
file = 'PCLR_MTBLS92.xlsx'
 
bootmodel.save_results(file)
```
 
%% Output
 
Done! Saved results as PCLR_MTBLS92.xlsx
 
%% Cell type:code id: tags:
 
```
# Rank features by the absolute logistic regression coefficients of the
# fitted PCLR model. Note: model.model is the underlying scikit-learn
# LogisticRegression, whose coefficients correspond to principal component
# scores rather than individual metabolites, so this ranking is only a
# rough heuristic (see the back-projection sketch below for an alternative).
coef = model.model.coef_.flatten()
importance = np.abs(coef)

# Indices sorted by decreasing absolute coefficient
idx = importance.argsort()[::-1]

# Look up the top 5 entries in the Peak sheet
top_met_info = PeakTable.iloc[idx[:5]]
print(top_met_info)
```
 
%% Output
 
Idx Name Label
6 6 M6 LysoPC(18:0)
42 42 M42 PC(38:2e)
8 8 M8 LysoPC(18:2)
16 16 M16 PC(32:0)
53 53 M53 PC(40:4e)
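%% Cell type:markdown id: tags:

Because the logistic regression inside PCLR operates on principal component scores, a metabolite-level importance is better obtained by projecting the component coefficients back through the PCA loadings. A hedged sketch that refits the two stages with standalone scikit-learn objects (so no cimcb internals are assumed; the LogisticRegression defaults here may differ from those used by cb.model.PCLR):

%% Cell type:code id: tags:

```
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# Refit the two PCLR stages explicitly so the loadings are accessible.
pca = PCA(n_components=48)
scores = pca.fit_transform(XTrainKnn)            # (n_samples, 48)
logreg = LogisticRegression(max_iter=1000).fit(scores, YTrain)

# Back-project: beta_feature = loadings^T @ beta_component, giving one
# coefficient per metabolite in the original (scaled) variable space.
beta = pca.components_.T @ logreg.coef_.ravel()  # (n_features,)
top5 = np.abs(beta).argsort()[::-1][:5]
print(PeakTable.iloc[top5][['Idx', 'Name', 'Label']])
```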