Skip to content
Snippets Groups Projects
Commit 1d57dfa3 authored by aakan96's avatar aakan96
Browse files

Update file mRNA_TCGA_limma_dataset_svm_final.ipynb

parent c9dfc4c1
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id:4994dbd6 tags:
``` python
## TRAINING SET : MERGED DATA , TESTING DATA: TCGA mRNA data
```
%% Cell type:code id:f097ad55 tags:
``` python
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import RocCurveDisplay
import matplotlib.pyplot as plt
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import Lasso
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
np.random.seed(7)
```
%% Cell type:markdown id:73b6611a tags:
# Data Preprocessing
%% Cell type:code id:0eeb7a35 tags:
``` python
# Load the training table from CSV; it is transposed in a later cell.
df_train = pd.read_csv("DS/mRNA_DS_preprocessed_training_data.csv")
```
%% Cell type:code id:a6ab23aa tags:
``` python
# Load the TCGA test table from CSV; it is transposed in a later cell.
df_test = pd.read_csv("DS/mRNA_TCGA_DS_test_data.csv")
```
%% Cell type:code id:683b63ce tags:
``` python
# Swap rows and columns of both tables so each original CSV column becomes a row.
df_train, df_test = df_train.T, df_test.T
```
%% Cell type:code id:a701e30d tags:
``` python
#df_test = df_test[:-1]
```
%% Cell type:code id:77d974be tags:
``` python
# Promote the first row of the transposed training table to column names,
# remove that row from the data, then move the row labels into an "index" column.
train_header_row = df_train.iloc[0]
df_train = (
    df_train.rename(columns=train_header_row)
    .drop(index=df_train.index[0])
    .reset_index()
)
```
%% Cell type:code id:610fbe2d tags:
``` python
# Remove the 'ACPP' column from the training table.
# NOTE(review): the reason for excluding ACPP is not visible in this notebook — confirm.
df_train = df_train.drop(columns=['ACPP'])
```
%% Cell type:code id:2e78017d tags:
``` python
# Promote the first row of the transposed test table to column names,
# remove that row from the data, then move the row labels into an "index" column.
test_header_row = df_test.iloc[0]
df_test = (
    df_test.rename(columns=test_header_row)
    .drop(index=df_test.index[0])
    .reset_index()
)
```
%% Cell type:code id:ea60801d tags:
``` python
# Load the per-sample metadata for the test set; it is joined onto df_test in the next cell.
metadata_test = pd.read_csv("DS/mRNA_TCGA_DS_col_data.csv")
```
%% Cell type:code id:58d531b9 tags:
``` python
# Attach metadata to each test row by matching the "index" column against the
# metadata's "Unnamed: 0" column. Inner join: rows without a match are dropped.
df_test = df_test.merge(
    metadata_test,
    how="inner",
    left_on="index",
    right_on="Unnamed: 0",
)
```
%% Cell type:code id:5eeaaa39 tags:
``` python
# Rename the test label column to "title0", the name the label-mapping cells below operate on.
df_test = df_test.rename({"group_assignments": "title0"}, axis="columns")
```
%% Cell type:code id:7910f2fa tags:
``` python
# Map labels matching Control/mucosa/normal/healthy (case-insensitive) to class 0.
df_test['title0'] = df_test['title0'].replace(
    to_replace='(?i)Control|mucosa|normal|healthy',
    value=0,
    regex=True,
)
```
%% Cell type:code id:c102e10e tags:
``` python
# Map labels matching Tumor/Cancer/carcinoma (case-insensitive) to class 1.
df_test['title0'] = df_test['title0'].replace(
    to_replace='(?i)Tumor|Cancer|carcinoma',
    value=1,
    regex=True,
)
```
%% Cell type:code id:ed00d2ce tags:
``` python
```
%% Cell type:code id:6c255d2e tags:
``` python
# Display how many test rows carry each label after the 0/1 mapping.
df_test['title0'].value_counts()
```
%% Output
title0
1 184
0 13
Name: count, dtype: int64
%% Cell type:code id:636b44ab tags:
``` python
# Keep only rows whose label became numeric; labels that matched neither
# pattern are still text, coerce to NaN here, and are filtered out.
test_label_is_numeric = pd.to_numeric(df_test['title0'], errors='coerce').notnull()
df_test = df_test[test_label_is_numeric]
```
%% Cell type:code id:34896f9a tags:
``` python
# Drop the two identifier columns used as join keys; they are not features.
df_test = df_test.drop(columns=['index', 'Unnamed: 0'])
```
%% Cell type:code id:359b5bab tags:
``` python
# From here on the 0/1 label column is called "index".
df_test = df_test.rename({"title0": "index"}, axis="columns")
```
%% Cell type:code id:e8befa4e tags:
``` python
# Build a smaller, less imbalanced evaluation subset of the test samples.
# NOTE: .head() takes the first rows in table order; this is not a random sample.
N_CLASS_0 = 13  # number of class-0 rows to keep
N_CLASS_1 = 27  # number of class-1 rows to keep
# Select the first N_CLASS_0 rows of class 0
class_0 = df_test[df_test['index'] == 0].head(N_CLASS_0)
# Select the first N_CLASS_1 rows of class 1
class_1 = df_test[df_test['index'] == 1].head(N_CLASS_1)
# Concatenate the selected rows (class 0 first, then class 1)
df_test = pd.concat([class_0, class_1])
```
%% Cell type:code id:fc9ca29c tags:
``` python
# Display the class counts of the sub-sampled test set.
df_test['index'].value_counts()
```
%% Output
index
1 27
0 13
Name: count, dtype: int64
%% Cell type:code id:dc14bb1c tags:
``` python
# Split the test table into the 0/1 label vector and the feature matrix.
y_test = df_test['index']
X_test = df_test.drop(columns="index")
```
%% Cell type:code id:4c50c510 tags:
``` python
# Load the per-sample metadata for the training set; it is joined onto df_train in the next cell.
metadata_train = pd.read_csv("DS/mRNA_DS_metadata_col_info.csv")
```
%% Cell type:code id:6730cf89 tags:
``` python
# Attach metadata to each training row by matching the "index" column against the
# metadata's "Unnamed: 0" column. Inner join: rows without a match are dropped.
df_train = df_train.merge(
    metadata_train,
    how="inner",
    left_on="index",
    right_on="Unnamed: 0",
)
```
%% Cell type:code id:7a8ad8ad tags:
``` python
# Map labels matching mucosa/normal/healthy (case-insensitive) to class 0.
# NOTE(review): unlike the test-set mapping, "Control" is not in this pattern — confirm that is intended.
df_train['title0'] = df_train['title0'].replace(
    to_replace='(?i)mucosa|normal|healthy',
    value=0,
    regex=True,
)
```
%% Cell type:code id:a8cf8643 tags:
``` python
# Map labels matching Tumor/Cancer/carcinoma (case-insensitive) to class 1.
df_train['title0'] = df_train['title0'].replace(
    to_replace='(?i)Tumor|Cancer|carcinoma',
    value=1,
    regex=True,
)
```
%% Cell type:code id:f5d203aa tags:
``` python
# Keep only rows whose label became numeric; labels that matched neither
# pattern are still text, coerce to NaN here, and are filtered out.
train_label_is_numeric = pd.to_numeric(df_train['title0'], errors='coerce').notnull()
df_train = df_train[train_label_is_numeric]
```
%% Cell type:code id:523bdaa6 tags:
``` python
# Drop the two identifier columns used as join keys; they are not features.
df_train = df_train.drop(columns=['index', 'Unnamed: 0'])
```
%% Cell type:code id:46a6fb36 tags:
``` python
# From here on the 0/1 label column is called "index".
df_train = df_train.rename({"title0": "index"}, axis="columns")
```
%% Cell type:code id:fbaf2507 tags:
``` python
# Convert every column of the training table (features and the label column) to a numeric dtype.
# pd.to_numeric is called with its default error handling, so an unparseable value raises.
df_train= df_train.apply(pd.to_numeric)
```
%% Cell type:code id:776cfbee tags:
``` python
# Display the class counts of the training labels.
df_train['index'].value_counts()
```
%% Output
index
0 111
1 108
Name: count, dtype: int64
%% Cell type:code id:8c0011ea tags:
``` python
# Split the training table into the 0/1 label vector and the feature matrix.
y = df_train['index']
X = df_train.drop(columns="index")
```
%% Cell type:code id:fc606979 tags:
``` python
# Cast the training features to integers.
# NOTE(review): an integer cast discards any fractional part, and no matching cast is
# applied to X_test in this notebook — confirm the inputs are integer-valued.
X = X.astype(int)
```
%% Cell type:code id:93e28118 tags:
``` python
# Cast the training labels to integers (0 / 1).
y = y.astype(int)
```
%% Cell type:markdown id:e9830b6c tags:
# Feature Selection
%% Cell type:code id:1cc528fb tags:
``` python
# Fit an L1-penalised linear model (alpha=1) on the full training table;
# fit() returns the estimator itself, so construction and fitting are chained.
lasso = Lasso(alpha=1).fit(X, y)
lasso_coefs = lasso.coef_
feature_names = X.columns
# Keep the features whose coefficient is non-zero after fitting.
selected_feature_indices = np.nonzero(lasso_coefs)[0]
selected_features = list(feature_names[selected_feature_indices])
X_selected = X.iloc[:, selected_feature_indices]
```
%% Cell type:code id:8afa29ae tags:
``` python
# Display how many features the LASSO step kept.
len(selected_features)
```
%% Output
98
%% Cell type:markdown id:6cee6462 tags:
# Test train split
%% Cell type:code id:3af09ef8 tags:
``` python
# No split is performed here: the whole LASSO-reduced training table is the
# training set, and the separate test table serves as the test set.
X_train, y_train = X_selected, y
```
%% Cell type:code id:129430e6 tags:
``` python
# Display the class counts of the test labels and the training labels side by side.
y_test.value_counts(),y_train.value_counts()
```
%% Output
(index
1 27
0 13
Name: count, dtype: int64,
index
0 111
1 108
Name: count, dtype: int64)
%% Cell type:markdown id:1cfe2a06 tags:
# Cross validation
%% Cell type:code id:1fbca4b8 tags:
``` python
from sklearn.svm import SVC

# Linear-kernel SVM. probability=True enables predict_proba, which the ROC cell relies on.
# class_weight='balanced' could be added to penalise mistakes on the rarer class more.
svm_model = SVC(
    kernel="linear",
    probability=True,
    random_state=47,
)
```
%% Cell type:code id:0502e118 tags:
``` python
# Candidate values of the SVM regularisation parameter C to search over.
param_grid = dict(C=[0.0005, 0.0001, 0.001, 0.1])
```
%% Cell type:code id:7f2d18b0 tags:
``` python
# Exhaustive search over param_grid using the default cross-validation and scoring.
# refit=True retrains the best setting on all data passed to fit().
grid = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    refit=True,
    verbose=3,
)
```
%% Cell type:code id:79790f1d tags:
``` python
# Fitting the model for grid search
# No `cv` argument was given, so GridSearchCV uses its default splitter (5 folds in the log below).
# NOTE(review): the LASSO feature selection was fitted on all of X/y before this
# cross-validation, so the per-fold scores may be optimistic — confirm this is acceptable.
grid.fit(X_train, y_train)
```
%% Output
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END ..........................C=0.0005;, score=0.955 total time= 0.0s
[CV 2/5] END ..........................C=0.0005;, score=0.932 total time= 0.0s
[CV 3/5] END ..........................C=0.0005;, score=0.955 total time= 0.0s
[CV 4/5] END ..........................C=0.0005;, score=0.932 total time= 0.0s
[CV 5/5] END ..........................C=0.0005;, score=0.884 total time= 0.0s
[CV 1/5] END ..........................C=0.0001;, score=0.932 total time= 0.0s
[CV 2/5] END ..........................C=0.0001;, score=0.795 total time= 0.0s
[CV 3/5] END ..........................C=0.0001;, score=0.955 total time= 0.0s
[CV 4/5] END ..........................C=0.0001;, score=0.886 total time= 0.0s
[CV 5/5] END ..........................C=0.0001;, score=0.860 total time= 0.0s
[CV 1/5] END ...........................C=0.001;, score=0.977 total time= 0.0s
[CV 2/5] END ...........................C=0.001;, score=0.977 total time= 0.0s
[CV 3/5] END ...........................C=0.001;, score=0.977 total time= 0.0s
[CV 4/5] END ...........................C=0.001;, score=0.932 total time= 0.0s
[CV 5/5] END ...........................C=0.001;, score=0.884 total time= 0.0s
[CV 1/5] END .............................C=0.1;, score=0.705 total time= 0.0s
[CV 2/5] END .............................C=0.1;, score=0.886 total time= 0.0s
[CV 3/5] END .............................C=0.1;, score=0.977 total time= 0.0s
[CV 4/5] END .............................C=0.1;, score=0.977 total time= 0.0s
[CV 5/5] END .............................C=0.1;, score=0.953 total time= 0.0s
GridSearchCV(estimator=SVC(kernel='linear', probability=True, random_state=47),
param_grid={'C': [0.0005, 0.0001, 0.001, 0.1]}, verbose=3)
%% Cell type:markdown id:bc91d663 tags:
# Classification
%% Cell type:code id:5d327876 tags:
``` python
# Report the winning hyper-parameters, then the estimator built from them, one per line.
print(grid.best_params_, grid.best_estimator_, sep="\n")
```
%% Output
{'C': 0.001}
SVC(C=0.001, kernel='linear', probability=True, random_state=47)
%% Cell type:code id:f8d67e2f tags:
``` python
# Keep only the LASSO-selected features in the test table, in the training column order.
# A KeyError is raised if the test table lacks any of them.
X_test = X_test[X_train.columns]
```
%% Cell type:code id:e91a687d tags:
``` python
# Drop any test column that contains a missing value.
# NOTE(review): if this removes a column, X_test no longer has the same features the
# model was trained on — confirm that no NaNs are expected here.
X_test = X_test.dropna(axis="columns")
```
%% Cell type:code id:3b2776c0 tags:
``` python
# Take the estimator selected by the grid search and fit it on the full training set.
# NOTE(review): the grid was built with refit=True, so best_estimator_ should already be
# fitted on this same data; this fit looks redundant — confirm.
model_svm = grid.best_estimator_
model_svm.fit(X_train,y_train)
```
%% Output
SVC(C=0.001, kernel='linear', probability=True, random_state=47)
%% Cell type:code id:eb9cd20a tags:
``` python
#######CONFUSION MATRIX ###########
from sklearn import metrics

# Hard class predictions for the test samples
y_test_pred_svm = model_svm.predict(X_test)
# Count matrix of true labels (first argument) against predicted labels
confusion_matrix_test = metrics.confusion_matrix(y_test, y_test_pred_svm)
cm_display = metrics.ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix_test,
    display_labels=[False, True],
)
cm_display.plot()
plt.show()
```
%% Output
%% Cell type:code id:94871ada tags:
``` python
# Predicted probability of class 1 (second column of predict_proba) for each test sample.
# model_svm is already fitted on (X_train, y_train) by the earlier cells, so it is
# not refitted here; refitting only repeated the same training.
y_proba = model_svm.predict_proba(X_test)[:, 1]
```
%% Cell type:code id:2d5767d3 tags:
``` python
# Derive summary metrics from the 2x2 confusion matrix.
# scikit-learn puts true labels on rows and predicted labels on columns, in
# sorted label order, so for labels {0, 1} the layout is
#   [[TN, FP],
#    [FN, TP]]
# with class 1 (the tumour label in this notebook) as the positive class.
tn, fp, fn, tp = confusion_matrix_test.ravel()
total1 = confusion_matrix_test.sum()
#####from confusion matrix calculate accuracy
accuracy1 = (tn + tp) / total1
print ('Accuracy : ', accuracy1)
# Sensitivity (true-positive rate): share of class-1 samples predicted as 1.
sensitivity1 = tp / (tp + fn)
print('Sensitivity : ', sensitivity1 )
# Specificity (true-negative rate): share of class-0 samples predicted as 0.
specificity1 = tn / (tn + fp)
print('Specificity : ', specificity1)
```
%% Output
Accuracy : 0.725
Sensitivity : 0.3076923076923077
Specificity : 0.9259259259259259
%% Cell type:code id:66858777 tags:
``` python
from sklearn.metrics import classification_report, confusion_matrix

# Predict through the grid object (it uses the refitted best estimator)
# and print per-class precision / recall / F1.
grid_predictions = grid.predict(X_test)
report_text = classification_report(y_test, grid_predictions)
print(report_text)
```
%% Output
precision recall f1-score support
0 0.67 0.31 0.42 13
1 0.74 0.93 0.82 27
accuracy 0.73 40
macro avg 0.70 0.62 0.62 40
weighted avg 0.71 0.72 0.69 40
%% Cell type:code id:c1095af0 tags:
``` python
# Coefficients of the linear SVM, one per selected feature
important_feat = model_svm.coef_[0]
# Feature positions ordered from largest to smallest coefficient; keep the first 50.
# NOTE(review): the ranking uses the signed value, so strongly negative coefficients
# are excluded — confirm this is intended rather than ranking by absolute value.
descending_order = important_feat.argsort(kind="quicksort")[::-1]
idx = descending_order[:50]
```
%% Cell type:code id:ae7e0162 tags:
``` python
# Transposed view of the selected-feature table: one row per feature.
df1 = X_selected.transpose()
```
%% Cell type:code id:1d97f818 tags:
``` python
# Rows of df1 at the positions in idx, i.e. the 50 features with the largest SVM
# coefficients. Positions line up because the model was trained on X_selected's columns.
top_met = df1.iloc[idx]
```
%% Cell type:code id:4cd4227b tags:
``` python
# Display the names of the 50 top-ranked features.
top_met.index
```
%% Output
Index(['COL1A1', 'ECT2', 'COL7A1', 'IGF2BP2', 'COL5A2', 'NUSAP1', 'MCM2',
'IGFBP3', 'FSCN1', 'LAMB3', 'RPN1', 'AGRN', 'EFNA1', 'LAMC2', 'TAPBP',
'HSPBAP1', 'TGIF1', 'TYMP', 'ANO1', 'LCN2', 'UCHL1', 'AURKA', 'AIM2',
'RUVBL1', 'TSPAN6', 'MYH10', 'DHRS2', 'IFI35', 'ERCC3', 'ENTPD6',
'SLC2A1', 'PITX1', 'SSRP1', 'PCSK5', 'HSPD1', 'IL1RN', 'SERPINB13',
'LYPD3', 'ACLY', 'SCNN1A', 'TMF1', 'GALNT1', 'SPRR3', 'ITPKC', 'STK24',
'NT5C2', 'PTN', 'EMP1', 'TMPRSS11D', 'TST'],
dtype='object')
%% Cell type:code id:8f6d88bb tags:
``` python
# Display the names of all features kept by the LASSO step.
X_selected.columns
```
%% Output
Index(['ACLY', 'ACTG2', 'AGRN', 'AIM2', 'ALDH9A1', 'ALOX12', 'ANO1', 'AQP3',
'ATP6V1D', 'AURKA', 'CCNG2', 'CES2', 'CFD', 'CH25H', 'CLIC3', 'COL1A1',
'COL5A2', 'COL7A1', 'CRABP2', 'CRCT1', 'CRISP3', 'CRNN', 'CYP4B1',
'DHRS1', 'DHRS2', 'DUOX1', 'DUSP5', 'ECM1', 'ECT2', 'EFNA1', 'EMP1',
'ENTPD6', 'ERCC3', 'FLG', 'FSCN1', 'GALE', 'GALNT1', 'GPX3', 'HOPX',
'HSPB8', 'HSPBAP1', 'HSPD1', 'ID4', 'IFI35', 'IGF2BP2', 'IGFBP3',
'IL1RN', 'INPP1', 'ITPKC', 'KANK1', 'KLK13', 'KRT4', 'LAMB3', 'LAMC2',
'LCN2', 'LYPD3', 'MAL', 'MCM2', 'MUC1', 'MYH10', 'NDRG2', 'NT5C2',
'NUSAP1', 'PCSK5', 'PHLDA1', 'PITX1', 'PPP1R3C', 'PTN', 'RAB11FIP1',
'RANBP9', 'RHCG', 'RND3', 'RPN1', 'RUVBL1', 'SCNN1A', 'SERPINB13',
'SERPINB2', 'SIM2', 'SLC2A1', 'SLK', 'SPRR3', 'SSRP1', 'STK24',
'SYNPO2L', 'TAPBP', 'TFAP2B', 'TGIF1', 'TIAM1', 'TJP1', 'TMF1',
'TMPRSS11D', 'TMPRSS11E', 'TSPAN6', 'TST', 'TYMP', 'UCHL1', 'ZBTB16',
'ZNF185'],
dtype='object')
%% Cell type:code id:5d9ff727 tags:
``` python
from sklearn.metrics import roc_curve, auc

# ROC points (false/true positive rate per threshold) from the class-1 probabilities
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
# Area under that curve
roc_auc = auc(fpr, tpr)

# Draw the ROC curve against the chance diagonal
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='red', linestyle='--')
plt.title('SVM')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.legend(loc='lower right')
plt.show()
```
%% Output
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment