diff --git a/01_Differential_Expression_Analysis/merging/merge_main.R b/01_Differential_Expression_Analysis/merging/merge_main.R new file mode 100644 index 0000000000000000000000000000000000000000..1168efb8d8c11a7a861a9f6e616ee64158adea01 --- /dev/null +++ b/01_Differential_Expression_Analysis/merging/merge_main.R @@ -0,0 +1,263 @@ +library(readr) +library(data.table) +library(GEOquery) +library(dplyr) +#require(hta20transcriptcluster.db) +###############miRNA1########################################## +#read the top significant genes found by using GEO2R tool: +GSE43732_top_table <- read_delim("GSE43732.top.table.tsv", + delim = "\t", escape_double = FALSE, + trim_ws = TRUE) +#filter with the help of padj. values +resFilt_GSE43732 <- GSE43732_top_table[which(GSE43732_top_table$adj.P.Val < 0.05 & abs(GSE43732_top_table$logFC) > 0.263), ] + + +# load series and platform data from GEO +gset1 <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T) +# choose the index +if (length(gset1) > 1) idx <- grep("GPL6480", attr(gset1, "names")) else idx <- 1 +gset1 <- gset1[[idx]] +#load the expression data. +t0<- gset1@assayData[["exprs"]] + +# extract the information of cancer/normal/adenoma from the metadata. +title0<- gset1@phenoData@data[["characteristics_ch1"]] +#extract the sample names from the expression data. +clname<- colnames(t0) +# make dataframe of the sample names and their corresponding information about cancer/non cancer/adema +d_col<- as.data.frame(title0,clname) +# get metadata out of S4 object +gene_data_frame = fData(gset1) +d<-merge(t0,gene_data_frame, by.x=0, by.y= "ID") +d_f<- merge(d,resFilt_GSE43732, by.x="Row.names", by.y= "ID") +duplicated_genes <- d_f$Row.names[duplicated(d_f$Row.names)] +d_f1<- d_f %>% distinct(`Row.names`, .keep_all = T) + + +###############miRNA2####this was just a try(not included in the project)###################################### +#read the top significant genes found by using GEO2R tool: +GSE67269_top_table <- read_delim("GSE67269.top.table.tsv", + delim = "\t", escape_double = FALSE, + trim_ws = TRUE) +#filter with the help of padj. values +resFilt_GSE67269 <- GSE67269_top_table[which(GSE67269_top_table$adj.P.Val < 0.05 & abs(GSE67269_top_table$logFC) > 0.263), ] + + +# load series and platform data from GEO +gset1 <- getGEO("GSE67269", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T) +# choose the index +if (length(gset1) > 1) idx <- grep("GPL19823 ", attr(gset1, "names")) else idx <- 1 +gset1 <- gset1[[1]] +mi_ids<- as.data.frame(gset1@featureData@data[["miRNA_ID"]]) +#load the expression data. +t0<- gset1@assayData[["exprs"]] +h_exprs<- merge(mi_ids,t0,by=0) +h_exprs["Row.names"]<- NULL +head (h_exprs,20) +colnames(h_exprs)[1] <- "miRNA_ID" +h_exprs$miRNA_ID <- gsub("\\*", "", h_exprs$miRNA_ID) +# Aggregate the data by miRNA_ID and take maximum values +aggregated_data <- aggregate(. ~ miRNA_ID, h_exprs, max) + +# extract the information of cancer/normal/adenoma from the metadata. +title0<- gset1@phenoData@data[["characteristics_ch1"]] +#extract the sample names from the expression data. +clname<- colnames(aggregated_data[,-1]) +# make dataframe of the sample names and their corresponding information about cancer/non cancer/adema +h_col<- as.data.frame(title0,clname) +# get metadata out of S4 object +gene_data_frame = fData(gset1) +h<-aggregated_data + +h_f<- merge(h,resFilt_GSE67269, by.x="miRNA_ID", by.y= "miRNA_ID") +duplicated_genes <- h_f$Row.names[duplicated(h_f$Row.names)] +h_f1<- h_f %>% distinct(`miRNA_ID`, .keep_all = T) +#merge all the data with the help of genesymbols. +#ab<- merge(d_f1,h_f1,by.x="Row.names",by.y="miRNA_ID") +#################################################### +#clean any duplicate gene symbols: +occurrenceClean <- d_f1[!duplicated(d_f1$Row.names),] +cleany<- occurrenceClean[ , colSums(is.na(occurrenceClean))==0]# remove columns with NA +# remove unnecessary columns +cleanyed<- cleany[, grep("GSM|Row.names", colnames(cleany))] +#again remove more columns +last_cleany<- na.omit(cleanyed)# remove any rows with NA. +#save the data +fwrite(d_col, file = "miRNA_DS_metadata_col_info.csv", sep = ",",row.names = TRUE) +fwrite(last_cleany, file = "miRNA_DS_preprocessed_data.csv",sep= ",") + + +######################mRNA############################################################ +GSE70409_top_table <- read_delim("GSE70409.top.table.tsv", + delim = "\t", escape_double = FALSE, + trim_ws = TRUE) +#filter with the help of padj. values +resFilt_GSE70409 <- GSE70409_top_table[which(GSE70409_top_table$adj.P.Val < 0.05 & abs(GSE70409_top_table$logFC) > 0.263), ] + + +# load series and platform data from GEO +gset1 <- getGEO("GSE70409", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T) +# choose the index +if (length(gset1) > 1) idx <- grep("GPL6480", attr(gset1, "names")) else idx <- 1 +gset1 <- gset1[[idx]] +#load the expression data. +t0<- gset1@assayData[["exprs"]] + +# extract the information of cancer/normal/adenoma from the metadata. +title0<- gset1@phenoData@data[["characteristics_ch1"]] +#extract the sample names from the expression data. +clname<- colnames(t0) +# make dataframe of the sample names and their corresponding information about cancer/non cancer/adema +a_col<- as.data.frame(title0,clname) +#pull out the gene information from metadata of GEO +gene_data_frame = fData(gset1) +#make the data ready: +a<-merge(t0,gene_data_frame, by.x=0, by.y= "ID") +a_f<- merge(a,resFilt_GSE70409, by.x="Row.names", by.y= "ID") +a_f1<- a_f %>% distinct(`Gene_symbol`, .keep_all = T) + + + +####################do the same for another data########################################################## +GSE20347_top_table <- read_delim("GSE20347.top.table.tsv", + delim = "\t", escape_double = FALSE, + trim_ws = TRUE) +#filter with the help of padj. values +resFilt_GSE20347 <- GSE20347_top_table[which(GSE20347_top_table$adj.P.Val < 0.05 & abs(GSE20347_top_table$logFC) > 0.263), ] + + +# load series and platform data from GEO +gset1 <- getGEO("GSE20347", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T) +# choose the index +if (length(gset1) > 1) idx <- grep("GPL6480", attr(gset1, "names")) else idx <- 1 +gset1 <- gset1[[idx]] +#load the expression data. +t0<- gset1@assayData[["exprs"]] + +# extract the information of cancer/normal/adenoma from the metadata. +title0<- gset1@phenoData@data[["characteristics_ch1"]] +#extract the sample names from the expression data. +clname<- colnames(t0) +# make dataframe of the sample names and their corresponding information about cancer/non cancer/adema +c_col<- as.data.frame(title0,clname) + +gene_data_frame = fData(gset1) + +cc<-merge(t0,gene_data_frame, by.x=0, by.y= "ID") +c_f<- merge(cc,resFilt_GSE20347, by.x="Row.names", by.y= "ID") +c_f1<- c_f %>% distinct(`Gene symbol`, .keep_all = T) + + +#################do the same for another data############################################## +GSE29001_top_table <- read_delim("GSE29001.top.table.tsv", + delim = "\t", escape_double = FALSE, + trim_ws = TRUE) +resFilt_GSE29001 <- GSE29001_top_table[which(GSE29001_top_table$adj.P.Val < 0.05 & abs(GSE29001_top_table$logFC) > 0.263), ] + +gset1 <- getGEO("GSE29001", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T) +if (length(gset1) > 1) idx <- grep("GPL6480", attr(gset1, "names")) else idx <- 1 +gset1 <- gset1[[idx]] +t0<- gset1@assayData[["exprs"]] + +gene_data_frame = fData(gset1) + + +title0<- gset1@phenoData@data[["characteristics_ch1.1"]] +clname<- colnames(t0) + +b_col<- as.data.frame(title0,clname) + + +b<-merge(t0,gene_data_frame, by.x=0, by.y= "ID") + +b_f<- merge(b,resFilt_GSE29001, by.x="Row.names", by.y= "ID") +b_f1<- b_f %>% distinct(`Gene symbol`, .keep_all = T) + +############################################# +#################do the same for another data############################################## +GSE23400_top_table <- read_delim("GSE23400.top.table.tsv", + delim = "\t", escape_double = FALSE, + trim_ws = TRUE) +resFilt_GSE23400 <- GSE23400_top_table[which(GSE23400_top_table$adj.P.Val < 0.05 & abs(GSE23400_top_table$logFC) > 0.263), ] + +gset1 <- getGEO("GSE23400", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T) +if (length(gset1) > 1) idx <- grep("GPL6480", attr(gset1, "names")) else idx <- 1 +gset1 <- gset1[[1]] +t0<- gset1@assayData[["exprs"]] + +gene_data_frame = fData(gset1) + + +title0<- gset1@phenoData@data[["source_name_ch1"]] +clname<- colnames(t0) + +g_col<- as.data.frame(title0,clname) + + +g<-merge(t0,gene_data_frame, by.x=0, by.y= "ID") + +g_f<- merge(g,resFilt_GSE23400, by.x="Row.names", by.y= "ID") +g_f1<- g_f %>% distinct(`Gene symbol`, .keep_all = T) + +############################################################################### +#merge all the data with the help of genesymbols. +ab<- merge(a_f1,b_f1,by.x="Gene_symbol",by.y="Gene symbol") +abc<- merge(ab,c_f1,by.x = "Gene_symbol", by.y= "Gene.symbol") +abcd<- merge(abc,g_f1,by.x = "Gene_symbol", by.y= "Gene.symbol") +############################################################ +#order the genesymbol with help of a column: +occurrence <- abcd[order(abcd$Gene_symbol, abcd$GSM1727130, decreasing=TRUE),] +#clean any duplicate gene symbols: +occurrenceClean <- occurrence[!duplicated(occurrence$Gene_symbol),] +cleany<- occurrenceClean[ , colSums(is.na(occurrenceClean))==0]# remove columns with NA +# remove unnecessary columns +cleanyed<- cleany[, grep("GSM|Gene_symbol", colnames(cleany))] +#again remove more columns +last_cleany<- na.omit(cleanyed)# remove any rows with NA. +#bind all metadata cols +binded_col<- rbind(a_col,b_col,c_col,g_col) +#write it and save: +fwrite(binded_col, file = "mRNA_DS_metadata_col_info.csv", sep = ",",row.names = TRUE) +library("readxl") +#merge with the results of multimir(just a try) +multi<- read_excel("multimir_results_final.xlsx") +mer<- merge(last_cleany,multi,by.x="Gene_symbol",by.y="target_symbol") +mer1<- mer %>% distinct(`Gene_symbol`, .keep_all = T) +mer1<- mer1[, grep("GSM|Gene_symbol", colnames(mer1))] + +#############test_data############################ +#GSE38129 +GSE38129_top_table <- read_delim("GSE38129.top.table.tsv", + delim = "\t", escape_double = FALSE, + trim_ws = TRUE) +resFilt_GSE38129 <- GSE38129_top_table[which(GSE38129_top_table$adj.P.Val < 0.05 & abs(GSE38129_top_table$logFC) > 0.263), ] + +gset1 <- getGEO("GSE38129", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T) +if (length(gset1) > 1) idx <- grep("GPL6480", attr(gset1, "names")) else idx <- 1 +gset1 <- gset1[[1]] +t0<- gset1@assayData[["exprs"]] + +gene_data_frame = fData(gset1) + + +title0<- gset1@phenoData@data[["source_name_ch1"]] +clname<- colnames(t0) + +t_col<- as.data.frame(title0,clname) + + +t<-merge(t0,gene_data_frame, by.x=0, by.y= "ID") + +t_f<- merge(t,resFilt_GSE38129, by.x="Row.names", by.y= "ID") +t_f1<- t_f %>% distinct(`Gene symbol`, .keep_all = T) +mer11<- t_f1[, grep("GSM|Gene symbol", colnames(t_f1))] +mer11 <- mer11 [1: ncol(mer11)-1 ] +mer11<-na.omit(mer11) + + +fwrite(t_col, file = "mRNA_DS_metadata_col_test_info.csv", sep = ",",row.names = TRUE) +fwrite(mer11, file = "mRNA_DS_test_data.csv",sep= ",") +mer12<- merge(x =mer1 , y = mer11, by.x = "Gene_symbol", by.y="Gene symbol", all.y = TRUE) +merged_data <- mer12[, names(mer1)] +merged_data<-na.omit(merged_data) +fwrite(merged_data, file = "mRNA_DS_preprocessed_training_data.csv",sep= ",")