Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
Datascience-SS2023
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Requirements
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
aakan96
Datascience-SS2023
Commits
1d57dfa3
Commit
1d57dfa3
authored
1 year ago
by
aakan96
Browse files
Options
Downloads
Patches
Plain Diff
Update file mRNA_TCGA_limma_dataset_svm_final.ipynb
parent
c9dfc4c1
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
03_Machine_Learning/mRNA_TCGA_limma_dataset_svm_final.ipynb
+10
-0
10 additions, 0 deletions
03_Machine_Learning/mRNA_TCGA_limma_dataset_svm_final.ipynb
with
10 additions
and
0 deletions
03_Machine_Learning/mRNA_TCGA_limma_dataset_svm_final.ipynb
+
10
−
0
View file @
1d57dfa3
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "4994dbd6",
"metadata": {},
"outputs": [],
"source": [
"## TRAINING SET : MERGED DATA , TESTING DATA: TCGA mRNA data"
]
},
{
"cell_type": "code",
"execution_count": 1,
...
...
%% Cell type:code id:4994dbd6 tags:
```
python
## TRAINING SET : MERGED DATA , TESTING DATA: TCGA mRNA data
```
%% Cell type:code id:f097ad55 tags:
```
python
import
warnings
warnings
.
filterwarnings
(
'
ignore
'
)
import
pandas
as
pd
from
sklearn.model_selection
import
train_test_split
from
sklearn.metrics
import
RocCurveDisplay
import
matplotlib.pyplot
as
plt
import
numpy
as
np
from
imblearn.over_sampling
import
SMOTE
from
sklearn.linear_model
import
Lasso
import
xgboost
as
xgb
from
sklearn.model_selection
import
GridSearchCV
import
pandas
as
pd
import
numpy
as
np
np
.
random
.
seed
(
7
)
```
%% Cell type:markdown id:73b6611a tags:
# Data Preprocessing
%% Cell type:code id:0eeb7a35 tags:
```
python
# Read the preprocessed training CSV into a DataFrame.
df_train = pd.read_csv(
    filepath_or_buffer="DS/mRNA_DS_preprocessed_training_data.csv",
)
```
%% Cell type:code id:a6ab23aa tags:
```
python
# Read the TCGA test CSV into a DataFrame.
df_test = pd.read_csv(
    filepath_or_buffer="DS/mRNA_TCGA_DS_test_data.csv",
)
```
%% Cell type:code id:683b63ce tags:
```
python
# Swap rows and columns of both frames in a single assignment.
df_train, df_test = df_train.T, df_test.T
```
%% Cell type:code id:a701e30d tags:
```
python
#df_test = df_test[:-1]
```
%% Cell type:code id:77d974be tags:
```
python
# Reshape the training frame: use the values of the first row as the new
# column labels, drop that row, then turn the old index into a regular
# column via reset_index().
train_header_row = df_train.iloc[0]
df_train = (
    df_train
    .rename(columns=train_header_row)
    .drop(df_train.index[0])
    .reset_index()
)
```
%% Cell type:code id:610fbe2d tags:
```
python
# Remove the 'ACPP' column from the training frame.
df_train = df_train.drop(columns=['ACPP'])
```
%% Cell type:code id:2e78017d tags:
```
python
# Reshape the test frame: use the values of the first row as the new
# column labels, drop that row, then turn the old index into a regular
# column via reset_index().
test_header_row = df_test.iloc[0]
df_test = (
    df_test
    .rename(columns=test_header_row)
    .drop(df_test.index[0])
    .reset_index()
)
```
%% Cell type:code id:ea60801d tags:
```
python
# Read the per-sample metadata table for the TCGA test set.
metadata_test = pd.read_csv(
    filepath_or_buffer="DS/mRNA_TCGA_DS_col_data.csv",
)
```
%% Cell type:code id:58d531b9 tags:
```
python
# Join the test frame with its metadata: the "index" column on the left is
# matched against the "Unnamed: 0" column of the metadata table.
df_test = df_test.merge(
    metadata_test,
    left_on="index",
    right_on="Unnamed: 0",
)
```
%% Cell type:code id:5eeaaa39 tags:
```
python
# Give the test label column the same name ("title0") that the training
# cells operate on.
df_test = df_test.rename(
    columns={
        "group_assignments": "title0",
    },
)
```
%% Cell type:code id:7910f2fa tags:
```
python
# Encode test labels matching the case-insensitive control/normal pattern
# as 0.
df_test['title0'] = df_test['title0'].replace(
    to_replace='(?i)Control|mucosa|normal|healthy',
    value=0,
    regex=True,
)
```
%% Cell type:code id:c102e10e tags:
```
python
# Encode test labels matching the case-insensitive tumor pattern as 1.
df_test['title0'] = df_test['title0'].replace(
    to_replace='(?i)Tumor|Cancer|carcinoma',
    value=1,
    regex=True,
)
```
%% Cell type:code id:ed00d2ce tags:
```
python
```
%% Cell type:code id:6c255d2e tags:
```
python
df_test['title0'].value_counts()
```
%% Output
title0
1 184
0 13
Name: count, dtype: int64
%% Cell type:code id:636b44ab tags:
```
python
df_test = df_test[pd.to_numeric(df_test['title0'], errors='coerce').notnull()]#remove all non-numeric data from the column.
```
%% Cell type:code id:34896f9a tags:
```
python
df_test= df_test.drop(['index', 'Unnamed: 0'], axis=1)
```
%% Cell type:code id:359b5bab tags:
```
python
df_test= df_test.rename(columns={"title0": "index"})
```
%% Cell type:code id:e8befa4e tags:
```
python
# Build a reduced test set from the first rows of each class. The row counts
# are named so the comments cannot drift from the code (the class-1 comment
# previously said 13 while the code took 27).
N_CLASS_0_ROWS = 13
N_CLASS_1_ROWS = 27
# Select the first 13 rows of class 0
class_0 = df_test[df_test['index'] == 0].head(N_CLASS_0_ROWS)
# Select the first 27 rows of class 1
class_1 = df_test[df_test['index'] == 1].head(N_CLASS_1_ROWS)
# Concatenate the selected rows (class 0 first, then class 1)
df_test = pd.concat([class_0, class_1])
```
%% Cell type:code id:fc9ca29c tags:
```
python
df_test['index'].value_counts()
```
%% Output
index
1 27
0 13
Name: count, dtype: int64
%% Cell type:code id:dc14bb1c tags:
```
python
X_test=df_test.drop("index",axis=1)
y_test=df_test['index']
```
%% Cell type:code id:4c50c510 tags:
```
python
metadata_train = pd.read_csv("DS/mRNA_DS_metadata_col_info.csv")
```
%% Cell type:code id:6730cf89 tags:
```
python
df_train= df_train.merge(metadata_train, left_on="index", right_on= "Unnamed: 0")
```
%% Cell type:code id:7a8ad8ad tags:
```
python
df_train['title0'] = df_train['title0'].replace('(?i)mucosa|normal|healthy', 0, regex=True)
```
%% Cell type:code id:a8cf8643 tags:
```
python
df_train['title0'] = df_train['title0'].replace('(?i)Tumor|Cancer|carcinoma', 1, regex=True)
```
%% Cell type:code id:f5d203aa tags:
```
python
df_train = df_train[pd.to_numeric(df_train['title0'], errors='coerce').notnull()]#remove all non-numeric data from the column.
```
%% Cell type:code id:523bdaa6 tags:
```
python
df_train= df_train.drop(['index', 'Unnamed: 0'], axis=1)
```
%% Cell type:code id:46a6fb36 tags:
```
python
df_train= df_train.rename(columns={"title0": "index"})
```
%% Cell type:code id:fbaf2507 tags:
```
python
df_train= df_train.apply(pd.to_numeric)
```
%% Cell type:code id:776cfbee tags:
```
python
df_train['index'].value_counts()
```
%% Output
index
0 111
1 108
Name: count, dtype: int64
%% Cell type:code id:8c0011ea tags:
```
python
X=df_train.drop("index",axis=1)
y=df_train['index']
```
%% Cell type:code id:fc606979 tags:
```
python
X=X.astype('int')
```
%% Cell type:code id:93e28118 tags:
```
python
y=y.astype('int')
```
%% Cell type:markdown id:e9830b6c tags:
# Feature Selection
%% Cell type:code id:1cc528fb tags:
```
python
# Fit a LASSO regression (alpha=1) of y on X; fit() returns the estimator.
lasso = Lasso(alpha=1).fit(X, y)
# Keep the fitted coefficients and the matching column labels side by side.
lasso_coefs = lasso.coef_
feature_names = X.columns
# Features whose coefficient is non-zero are the ones LASSO retained.
selected_feature_indices = np.flatnonzero(lasso_coefs)
selected_features = list(feature_names[selected_feature_indices])
X_selected = X.iloc[:, selected_feature_indices]
```
%% Cell type:code id:8afa29ae tags:
```
python
len(selected_features)
```
%% Output
98
%% Cell type:markdown id:6cee6462 tags:
# Test train split
%% Cell type:code id:3af09ef8 tags:
```
python
X_train = X_selected
y_train = y
```
%% Cell type:code id:129430e6 tags:
```
python
y_test.value_counts(),y_train.value_counts()
```
%% Output
(index
1 27
0 13
Name: count, dtype: int64,
index
0 111
1 108
Name: count, dtype: int64)
%% Cell type:markdown id:1cfe2a06 tags:
# Cross validation
%% Cell type:code id:1fbca4b8 tags:
```
python
from sklearn.svm import SVC

# Linear-kernel SVM with probability estimates enabled and a fixed seed.
# Optionally, class_weight='balanced' could be added to penalize mistakes
# on the less frequent class more heavily.
svm_model = SVC(
    kernel="linear",
    probability=True,
    random_state=47,
)
```
%% Cell type:code id:0502e118 tags:
```
python
# Defining parameter range
param_grid = {
'C': [0.0005,0.0001,0.001,0.1]
}
```
%% Cell type:code id:7f2d18b0 tags:
```
python
grid = GridSearchCV(svm_model, param_grid, refit=True, verbose=3)
```
%% Cell type:code id:79790f1d tags:
```
python
# Fitting the model for grid search
grid.fit(X_train, y_train)
```
%% Output
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END ..........................C=0.0005;, score=0.955 total time= 0.0s
[CV 2/5] END ..........................C=0.0005;, score=0.932 total time= 0.0s
[CV 3/5] END ..........................C=0.0005;, score=0.955 total time= 0.0s
[CV 4/5] END ..........................C=0.0005;, score=0.932 total time= 0.0s
[CV 5/5] END ..........................C=0.0005;, score=0.884 total time= 0.0s
[CV 1/5] END ..........................C=0.0001;, score=0.932 total time= 0.0s
[CV 2/5] END ..........................C=0.0001;, score=0.795 total time= 0.0s
[CV 3/5] END ..........................C=0.0001;, score=0.955 total time= 0.0s
[CV 4/5] END ..........................C=0.0001;, score=0.886 total time= 0.0s
[CV 5/5] END ..........................C=0.0001;, score=0.860 total time= 0.0s
[CV 1/5] END ...........................C=0.001;, score=0.977 total time= 0.0s
[CV 2/5] END ...........................C=0.001;, score=0.977 total time= 0.0s
[CV 3/5] END ...........................C=0.001;, score=0.977 total time= 0.0s
[CV 4/5] END ...........................C=0.001;, score=0.932 total time= 0.0s
[CV 5/5] END ...........................C=0.001;, score=0.884 total time= 0.0s
[CV 1/5] END .............................C=0.1;, score=0.705 total time= 0.0s
[CV 2/5] END .............................C=0.1;, score=0.886 total time= 0.0s
[CV 3/5] END .............................C=0.1;, score=0.977 total time= 0.0s
[CV 4/5] END .............................C=0.1;, score=0.977 total time= 0.0s
[CV 5/5] END .............................C=0.1;, score=0.953 total time= 0.0s
GridSearchCV(estimator=SVC(kernel='linear', probability=True, random_state=47),
param_grid={'C': [0.0005, 0.0001, 0.001, 0.1]}, verbose=3)
%% Cell type:markdown id:bc91d663 tags:
# classification
%% Cell type:code id:5d327876 tags:
```
python
# print best parameter after tuning
print(grid.best_params_)
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)
```
%% Output
{'C': 0.001}
SVC(C=0.001, kernel='linear', probability=True, random_state=47)
%% Cell type:code id:f8d67e2f tags:
```
python
# Select columns in df1 based on columns in df2
X_test = X_test.loc[:, X_train.columns]
```
%% Cell type:code id:e91a687d tags:
```
python
X_test=X_test.dropna(axis=1)
```
%% Cell type:code id:3b2776c0 tags:
```
python
model_svm = grid.best_estimator_
model_svm.fit(X_train,y_train)
```
%% Output
SVC(C=0.001, kernel='linear', probability=True, random_state=47)
%% Cell type:code id:eb9cd20a tags:
```
python
# ---- Confusion matrix on the test set ----
from sklearn import metrics

# Predict the test labels with the fitted SVM and tabulate them against the
# true labels.
y_test_pred_svm = model_svm.predict(X_test)
confusion_matrix_test = metrics.confusion_matrix(y_test, y_test_pred_svm)

# Render the matrix with False/True as the tick labels.
cm_display = metrics.ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix_test,
    display_labels=[False, True],
)
cm_display.plot()
plt.show()
```
%% Output
%% Cell type:code id:94871ada tags:
```
python
# Predicted probability of the second class (label 1) for each test sample.
# model_svm is grid.best_estimator_, which GridSearchCV already refitted on
# the full training data (refit=True) and which is fitted once more when it
# is assigned, so fitting it a third time here is redundant; with the fixed
# random_state the refit would reproduce the same model.
y_proba = model_svm.predict_proba(X_test)[:, 1]
```
%% Cell type:code id:2d5767d3 tags:
```
python
# Derive accuracy, sensitivity and specificity from the 2x2 test confusion
# matrix. scikit-learn puts true labels on the rows and predicted labels on
# the columns, in sorted label order, so with 0 = normal and 1 = tumor the
# layout is:
#   [[TN, FP],
#    [FN, TP]]
# The previous version computed the class-0 rate as "sensitivity" and the
# class-1 rate as "specificity", i.e. the two were swapped relative to the
# tumor (label 1) positive class.
tn, fp, fn, tp = confusion_matrix_test.ravel()
total1 = confusion_matrix_test.sum()
# Accuracy: fraction of all samples on the diagonal.
accuracy1 = (tn + tp) / total1
print ('Accuracy : ', accuracy1)
# Sensitivity (true positive rate): TP / (TP + FN), i.e. recall of class 1.
sensitivity1 = tp / (tp + fn)
print('Sensitivity : ', sensitivity1 )
# Specificity (true negative rate): TN / (TN + FP), i.e. recall of class 0.
specificity1 = tn / (tn + fp)
print('Specificity : ', specificity1)
```
%% Output
Accuracy : 0.725
Sensitivity : 0.3076923076923077
Specificity : 0.9259259259259259
%% Cell type:code id:66858777 tags:
```
python
from sklearn.metrics import classification_report, confusion_matrix
grid_predictions = grid.predict(X_test)
print(classification_report(y_test, grid_predictions))
```
%% Output
precision recall f1-score support
0 0.67 0.31 0.42 13
1 0.74 0.93 0.82 27
accuracy 0.73 40
macro avg 0.70 0.62 0.62 40
weighted avg 0.71 0.72 0.69 40
%% Cell type:code id:c1095af0 tags:
```
python
# Weight vector of the fitted linear SVM (first row of coef_).
important_feat = model_svm.coef_[0]
# Positions of the 50 largest coefficients, largest first: sort ascending,
# reverse, keep the first 50.
# NOTE(review): the ranking uses the signed value, so features with large
# negative weights are not included - confirm this is intended.
idx = important_feat.argsort(kind="quicksort")[::-1][:50]
```
%% Cell type:code id:ae7e0162 tags:
```
python
df1 = X_selected.T
```
%% Cell type:code id:1d97f818 tags:
```
python
top_met = df1.iloc[idx]
```
%% Cell type:code id:4cd4227b tags:
```
python
top_met.index
```
%% Output
Index(['COL1A1', 'ECT2', 'COL7A1', 'IGF2BP2', 'COL5A2', 'NUSAP1', 'MCM2',
'IGFBP3', 'FSCN1', 'LAMB3', 'RPN1', 'AGRN', 'EFNA1', 'LAMC2', 'TAPBP',
'HSPBAP1', 'TGIF1', 'TYMP', 'ANO1', 'LCN2', 'UCHL1', 'AURKA', 'AIM2',
'RUVBL1', 'TSPAN6', 'MYH10', 'DHRS2', 'IFI35', 'ERCC3', 'ENTPD6',
'SLC2A1', 'PITX1', 'SSRP1', 'PCSK5', 'HSPD1', 'IL1RN', 'SERPINB13',
'LYPD3', 'ACLY', 'SCNN1A', 'TMF1', 'GALNT1', 'SPRR3', 'ITPKC', 'STK24',
'NT5C2', 'PTN', 'EMP1', 'TMPRSS11D', 'TST'],
dtype='object')
%% Cell type:code id:8f6d88bb tags:
```
python
X_selected.columns
```
%% Output
Index(['ACLY', 'ACTG2', 'AGRN', 'AIM2', 'ALDH9A1', 'ALOX12', 'ANO1', 'AQP3',
'ATP6V1D', 'AURKA', 'CCNG2', 'CES2', 'CFD', 'CH25H', 'CLIC3', 'COL1A1',
'COL5A2', 'COL7A1', 'CRABP2', 'CRCT1', 'CRISP3', 'CRNN', 'CYP4B1',
'DHRS1', 'DHRS2', 'DUOX1', 'DUSP5', 'ECM1', 'ECT2', 'EFNA1', 'EMP1',
'ENTPD6', 'ERCC3', 'FLG', 'FSCN1', 'GALE', 'GALNT1', 'GPX3', 'HOPX',
'HSPB8', 'HSPBAP1', 'HSPD1', 'ID4', 'IFI35', 'IGF2BP2', 'IGFBP3',
'IL1RN', 'INPP1', 'ITPKC', 'KANK1', 'KLK13', 'KRT4', 'LAMB3', 'LAMC2',
'LCN2', 'LYPD3', 'MAL', 'MCM2', 'MUC1', 'MYH10', 'NDRG2', 'NT5C2',
'NUSAP1', 'PCSK5', 'PHLDA1', 'PITX1', 'PPP1R3C', 'PTN', 'RAB11FIP1',
'RANBP9', 'RHCG', 'RND3', 'RPN1', 'RUVBL1', 'SCNN1A', 'SERPINB13',
'SERPINB2', 'SIM2', 'SLC2A1', 'SLK', 'SPRR3', 'SSRP1', 'STK24',
'SYNPO2L', 'TAPBP', 'TFAP2B', 'TGIF1', 'TIAM1', 'TJP1', 'TMF1',
'TMPRSS11D', 'TMPRSS11E', 'TSPAN6', 'TST', 'TYMP', 'UCHL1', 'ZBTB16',
'ZNF185'],
dtype='object')
%% Cell type:code id:5d9ff727 tags:
```
python
from sklearn.metrics import roc_curve, auc

# ROC points (false/true positive rates per threshold) for the predicted
# class-1 probabilities, and the area under that curve.
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve together with the dashed diagonal reference line.
roc_label = 'ROC curve (area = %0.2f)' % roc_auc
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', label=roc_label)
plt.plot([0, 1], [0, 1], color='red', linestyle='--')

# Axis ranges, labels, title and legend.
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('SVM')
plt.legend(loc='lower right')
plt.show()
```
%% Output
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment