# merge_main.R

library(readr)
library(data.table)
library(GEOquery)
library(dplyr)
#require(hta20transcriptcluster.db)
############### miRNA dataset 1: GSE43732 ###############
# Import the differential-expression table exported from GEO2R.
GSE43732_top_table <- read_delim(
  "GSE43732.top.table.tsv",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
# Keep rows with adj.P.Val < 0.05 and |logFC| > 0.263.
# which() also discards rows where the comparison evaluates to NA.
sig_rows <- which(
  GSE43732_top_table$adj.P.Val < 0.05 &
    abs(GSE43732_top_table$logFC) > 0.263
)
resFilt_GSE43732 <- GSE43732_top_table[sig_rows, ]

# Fetch the series matrix together with its platform annotation from GEO.
gset1 <- getGEO("GSE43732", GSEMatrix = TRUE, AnnotGPL = TRUE, getGPL = TRUE)
# With more than one returned element, pick the one whose name matches
# GPL6480; otherwise use the only element.
if (length(gset1) > 1) {
  idx <- grep("GPL6480", names(gset1))
} else {
  idx <- 1
}
gset1 <- gset1[[idx]]

# Expression matrix (features in rows, samples in columns).
t0 <- exprs(gset1)

# Per-sample "characteristics_ch1" annotation from the phenotype table.
title0 <- pData(gset1)[["characteristics_ch1"]]
# Sample identifiers, taken from the expression matrix columns.
clname <- colnames(t0)
# One-column data frame of the annotation, with sample names as row names.
d_col <- as.data.frame(title0, row.names = clname)

# Feature (probe) annotation table.
gene_data_frame <- fData(gset1)
# Attach the annotation to the expression values (matrix row names vs "ID").
d <- merge(t0, gene_data_frame, by.x = 0, by.y = "ID")
# Restrict to the features that passed the significance filter.
d_f <- merge(d, resFilt_GSE43732, by.x = "Row.names", by.y = "ID")
# Record feature IDs that occur more than once after the joins.
is_repeat <- duplicated(d_f$Row.names)
duplicated_genes <- d_f$Row.names[is_repeat]
# Keep the first row for every feature ID.
d_f1 <- distinct(d_f, Row.names, .keep_all = TRUE)


############### miRNA dataset 2: GSE67269 ###############
# Exploratory section (per the original note: not included in the project).
# Import the differential-expression table exported from GEO2R.
GSE67269_top_table <- read_delim(
  "GSE67269.top.table.tsv",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
# Keep rows with adj.P.Val < 0.05 and |logFC| > 0.263 (NA comparisons dropped).
sig_rows <- which(
  GSE67269_top_table$adj.P.Val < 0.05 &
    abs(GSE67269_top_table$logFC) > 0.263
)
resFilt_GSE67269 <- GSE67269_top_table[sig_rows, ]

# Fetch the series matrix together with its platform annotation from GEO.
gset1 <- getGEO("GSE67269", GSEMatrix = TRUE, AnnotGPL = TRUE, getGPL = TRUE)
# Choose the element to use. The original pattern "GPL19823 " carried a
# trailing space (so it could never match) and the result was then ignored in
# favour of element 1. Match the platform name properly and fall back to the
# first element when nothing matches, which keeps the previous behaviour.
idx <- 1
if (length(gset1) > 1) {
  platform_hits <- grep("GPL19823", names(gset1))
  if (length(platform_hits) > 0) {
    idx <- platform_hits[1]
  }
}
gset1 <- gset1[[idx]]

# Expression matrix (features in rows, samples in columns).
t0 <- gset1@assayData[["exprs"]]
# miRNA identifiers keyed by the feature row names. Without explicit row names
# the merge below would join "1".."n" against the probe IDs of `t0`.
feature_info <- fData(gset1)
mi_ids <- data.frame(
  miRNA_ID = feature_info[["miRNA_ID"]],
  row.names = rownames(feature_info)
)
# Join identifiers and expression values on their row names, then drop the key.
h_exprs <- merge(mi_ids, t0, by = 0)
h_exprs["Row.names"] <- NULL
head(h_exprs, 20)
# Strip the "*" marker from the miRNA names.
h_exprs$miRNA_ID <- gsub("\\*", "", h_exprs$miRNA_ID)
# Collapse rows sharing a miRNA_ID, keeping the per-sample maximum.
aggregated_data <- aggregate(. ~ miRNA_ID, h_exprs, max)

# Per-sample "characteristics_ch1" annotation from the phenotype table.
title0 <- gset1@phenoData@data[["characteristics_ch1"]]
# Sample names: every column except the leading miRNA_ID column. Indexing the
# names (not the data frame) avoids collapsing to a vector with one sample.
clname <- colnames(aggregated_data)[-1]
# One-column data frame of the annotation, with sample names as row names.
h_col <- as.data.frame(title0, row.names = clname)
# Feature annotation table.
gene_data_frame <- fData(gset1)
h <- aggregated_data

# Restrict to the miRNAs that passed the significance filter.
h_f <- merge(h, resFilt_GSE67269, by.x = "miRNA_ID", by.y = "miRNA_ID")
# Record repeated miRNA IDs. The original looked at `Row.names`, a column that
# no longer exists at this point, so the result was always NULL.
duplicated_genes <- h_f$miRNA_ID[duplicated(h_f$miRNA_ID)]
# Keep the first row for every miRNA ID.
h_f1 <- distinct(h_f, miRNA_ID, .keep_all = TRUE)
# Disabled experiment: join both miRNA datasets on their identifiers.
#ab<- merge(d_f1,h_f1,by.x="Row.names",by.y="miRNA_ID")
####################################################
# Clean the GSE43732 table (d_f1) and write it out.
# Drop any repeated feature IDs.
is_first_occurrence <- !duplicated(d_f1$Row.names)
occurrenceClean <- d_f1[is_first_occurrence, ]
# Keep only columns that contain no NA at all.
complete_cols <- colSums(is.na(occurrenceClean)) == 0
cleany <- occurrenceClean[, complete_cols]
# Keep the sample columns (names containing "GSM") and the ID column.
wanted_cols <- grep("GSM|Row.names", colnames(cleany))
cleanyed <- cleany[, wanted_cols]
# Drop any remaining rows that contain NA.
last_cleany <- na.omit(cleanyed)
# Write the sample annotation (with row names) and the cleaned expression table.
fwrite(
  d_col,
  file = "miRNA_DS_metadata_col_info.csv",
  sep = ",",
  row.names = TRUE
)
fwrite(last_cleany, file = "miRNA_DS_preprocessed_data.csv", sep = ",")


############### mRNA dataset: GSE70409 ###############
# Import the differential-expression table exported from GEO2R.
GSE70409_top_table <- read_delim(
  "GSE70409.top.table.tsv",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
# Keep rows with adj.P.Val < 0.05 and |logFC| > 0.263 (NA comparisons dropped).
sig_rows <- which(
  GSE70409_top_table$adj.P.Val < 0.05 &
    abs(GSE70409_top_table$logFC) > 0.263
)
resFilt_GSE70409 <- GSE70409_top_table[sig_rows, ]

# Fetch the series matrix together with its platform annotation from GEO.
gset1 <- getGEO("GSE70409", GSEMatrix = TRUE, AnnotGPL = TRUE, getGPL = TRUE)
# With more than one returned element, pick the one whose name matches
# GPL6480; otherwise use the only element.
if (length(gset1) > 1) {
  idx <- grep("GPL6480", names(gset1))
} else {
  idx <- 1
}
gset1 <- gset1[[idx]]

# Expression matrix (features in rows, samples in columns).
t0 <- exprs(gset1)

# Per-sample "characteristics_ch1" annotation from the phenotype table.
title0 <- pData(gset1)[["characteristics_ch1"]]
# Sample identifiers, taken from the expression matrix columns.
clname <- colnames(t0)
# One-column data frame of the annotation, with sample names as row names.
a_col <- as.data.frame(title0, row.names = clname)

# Feature (probe) annotation table.
gene_data_frame <- fData(gset1)
# Attach the annotation to the expression values (matrix row names vs "ID").
a <- merge(t0, gene_data_frame, by.x = 0, by.y = "ID")
# Restrict to the features that passed the significance filter.
a_f <- merge(a, resFilt_GSE70409, by.x = "Row.names", by.y = "ID")
# Keep the first row per gene symbol (this dataset's column is `Gene_symbol`).
a_f1 <- distinct(a_f, Gene_symbol, .keep_all = TRUE)


############### mRNA dataset: GSE20347 ###############
# Import the differential-expression table exported from GEO2R.
GSE20347_top_table <- read_delim(
  "GSE20347.top.table.tsv",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
# Keep rows with adj.P.Val < 0.05 and |logFC| > 0.263 (NA comparisons dropped).
sig_rows <- which(
  GSE20347_top_table$adj.P.Val < 0.05 &
    abs(GSE20347_top_table$logFC) > 0.263
)
resFilt_GSE20347 <- GSE20347_top_table[sig_rows, ]

# Fetch the series matrix together with its platform annotation from GEO.
gset1 <- getGEO("GSE20347", GSEMatrix = TRUE, AnnotGPL = TRUE, getGPL = TRUE)
# With more than one returned element, pick the one whose name matches
# GPL6480; otherwise use the only element.
if (length(gset1) > 1) {
  idx <- grep("GPL6480", names(gset1))
} else {
  idx <- 1
}
gset1 <- gset1[[idx]]

# Expression matrix (features in rows, samples in columns).
t0 <- exprs(gset1)

# Per-sample "characteristics_ch1" annotation from the phenotype table.
title0 <- pData(gset1)[["characteristics_ch1"]]
# Sample identifiers, taken from the expression matrix columns.
clname <- colnames(t0)
# One-column data frame of the annotation, with sample names as row names.
c_col <- as.data.frame(title0, row.names = clname)

# Feature (probe) annotation table.
gene_data_frame <- fData(gset1)

# Attach the annotation to the expression values (matrix row names vs "ID").
cc <- merge(t0, gene_data_frame, by.x = 0, by.y = "ID")
# Restrict to the features that passed the significance filter.
c_f <- merge(cc, resFilt_GSE20347, by.x = "Row.names", by.y = "ID")
# Keep the first row per gene symbol.
c_f1 <- distinct(c_f, `Gene symbol`, .keep_all = TRUE)


############### mRNA dataset: GSE29001 ###############
# Import the differential-expression table exported from GEO2R.
GSE29001_top_table <- read_delim(
  "GSE29001.top.table.tsv",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
# Keep rows with adj.P.Val < 0.05 and |logFC| > 0.263 (NA comparisons dropped).
sig_rows <- which(
  GSE29001_top_table$adj.P.Val < 0.05 &
    abs(GSE29001_top_table$logFC) > 0.263
)
resFilt_GSE29001 <- GSE29001_top_table[sig_rows, ]

# Fetch the series matrix together with its platform annotation from GEO.
gset1 <- getGEO("GSE29001", GSEMatrix = TRUE, AnnotGPL = TRUE, getGPL = TRUE)
# With more than one returned element, pick the one whose name matches
# GPL6480; otherwise use the only element.
if (length(gset1) > 1) {
  idx <- grep("GPL6480", names(gset1))
} else {
  idx <- 1
}
gset1 <- gset1[[idx]]

# Expression matrix and feature (probe) annotation table.
t0 <- exprs(gset1)
gene_data_frame <- fData(gset1)

# For this series the annotation is read from "characteristics_ch1.1".
title0 <- pData(gset1)[["characteristics_ch1.1"]]
# Sample identifiers, taken from the expression matrix columns.
clname <- colnames(t0)
# One-column data frame of the annotation, with sample names as row names.
b_col <- as.data.frame(title0, row.names = clname)

# Attach the annotation to the expression values (matrix row names vs "ID").
b <- merge(t0, gene_data_frame, by.x = 0, by.y = "ID")
# Restrict to the features that passed the significance filter.
b_f <- merge(b, resFilt_GSE29001, by.x = "Row.names", by.y = "ID")
# Keep the first row per gene symbol.
b_f1 <- distinct(b_f, `Gene symbol`, .keep_all = TRUE)

############### mRNA dataset: GSE23400 ###############
# Import the differential-expression table exported from GEO2R.
GSE23400_top_table <- read_delim(
  "GSE23400.top.table.tsv",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
# Keep rows with adj.P.Val < 0.05 and |logFC| > 0.263 (NA comparisons dropped).
sig_rows <- which(
  GSE23400_top_table$adj.P.Val < 0.05 &
    abs(GSE23400_top_table$logFC) > 0.263
)
resFilt_GSE23400 <- GSE23400_top_table[sig_rows, ]

# Fetch the series matrix together with its platform annotation from GEO.
gset1 <- getGEO("GSE23400", GSEMatrix = TRUE, AnnotGPL = TRUE, getGPL = TRUE)
if (length(gset1) > 1) {
  idx <- grep("GPL6480", names(gset1))
} else {
  idx <- 1
}
# NOTE(review): `idx` is computed above but not used here; the first returned
# element is always taken. Confirm this is the intended platform.
gset1 <- gset1[[1]]

# Expression matrix and feature (probe) annotation table.
t0 <- exprs(gset1)
gene_data_frame <- fData(gset1)

# For this series the annotation is read from "source_name_ch1".
title0 <- pData(gset1)[["source_name_ch1"]]
# Sample identifiers, taken from the expression matrix columns.
clname <- colnames(t0)
# One-column data frame of the annotation, with sample names as row names.
g_col <- as.data.frame(title0, row.names = clname)

# Attach the annotation to the expression values (matrix row names vs "ID").
g <- merge(t0, gene_data_frame, by.x = 0, by.y = "ID")
# Restrict to the features that passed the significance filter.
g_f <- merge(g, resFilt_GSE23400, by.x = "Row.names", by.y = "ID")
# Keep the first row per gene symbol.
g_f1 <- distinct(g_f, `Gene symbol`, .keep_all = TRUE)

############### Combine the four mRNA datasets ###############
# Chain inner joins on the gene symbol. The key column is spelled differently
# in each table, hence the differing `by.y` values.
ab <- merge(a_f1, b_f1, by.x = "Gene_symbol", by.y = "Gene symbol")
abc <- merge(ab, c_f1, by.x = "Gene_symbol", by.y = "Gene.symbol")
abcd <- merge(abc, g_f1, by.x = "Gene_symbol", by.y = "Gene.symbol")
############################################################
# Sort descending by gene symbol, then by the values of sample GSM1727130.
row_order <- order(abcd$Gene_symbol, abcd$GSM1727130, decreasing = TRUE)
occurrence <- abcd[row_order, ]
# After sorting, keep only the first row for each gene symbol.
is_first_occurrence <- !duplicated(occurrence$Gene_symbol)
occurrenceClean <- occurrence[is_first_occurrence, ]
# Keep only columns that contain no NA at all.
complete_cols <- colSums(is.na(occurrenceClean)) == 0
cleany <- occurrenceClean[, complete_cols]
# Keep the sample columns (names containing "GSM") and the gene symbol column.
wanted_cols <- grep("GSM|Gene_symbol", colnames(cleany))
cleanyed <- cleany[, wanted_cols]
# Drop any remaining rows that contain NA.
last_cleany <- na.omit(cleanyed)
# Stack the per-dataset sample annotation tables and write them out.
binded_col <- rbind(a_col, b_col, c_col, g_col)
fwrite(
  binded_col,
  file = "mRNA_DS_metadata_col_info.csv",
  sep = ",",
  row.names = TRUE
)
library("readxl")
# Exploratory step: restrict the genes to those listed as `target_symbol` in
# the multiMiR results workbook.
multi <- read_excel("multimir_results_final.xlsx")
mer <- merge(last_cleany, multi, by.x = "Gene_symbol", by.y = "target_symbol")
# One row per gene symbol, then only the sample and gene symbol columns.
mer1 <- distinct(mer, Gene_symbol, .keep_all = TRUE)
mer1 <- mer1[, grep("GSM|Gene_symbol", colnames(mer1))]

############### Test data: GSE38129 ###############
# Import the differential-expression table exported from GEO2R.
GSE38129_top_table <- read_delim(
  "GSE38129.top.table.tsv",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)
# Keep rows with adj.P.Val < 0.05 and |logFC| > 0.263 (NA comparisons dropped).
sig_rows <- which(
  GSE38129_top_table$adj.P.Val < 0.05 &
    abs(GSE38129_top_table$logFC) > 0.263
)
resFilt_GSE38129 <- GSE38129_top_table[sig_rows, ]

# Fetch the series matrix together with its platform annotation from GEO.
gset1 <- getGEO("GSE38129", GSEMatrix = TRUE, AnnotGPL = TRUE, getGPL = TRUE)
if (length(gset1) > 1) {
  idx <- grep("GPL6480", names(gset1))
} else {
  idx <- 1
}
# NOTE(review): `idx` is computed above but not used here; the first returned
# element is always taken. Confirm this is the intended platform.
gset1 <- gset1[[1]]

# Expression matrix and feature (probe) annotation table.
t0 <- gset1@assayData[["exprs"]]
gene_data_frame <- fData(gset1)

# For this series the annotation is read from "source_name_ch1".
title0 <- gset1@phenoData@data[["source_name_ch1"]]
# Sample identifiers, taken from the expression matrix columns.
clname <- colnames(t0)
# One-column data frame of the annotation, with sample names as row names.
t_col <- as.data.frame(title0, row.names = clname)

# Attach the annotation to the expression values (matrix row names vs "ID").
# NOTE(review): the name `t` shadows base::t(); kept so existing references to
# this object keep working.
t <- merge(t0, gene_data_frame, by.x = 0, by.y = "ID")
# Restrict to the features that passed the significance filter.
t_f <- merge(t, resFilt_GSE38129, by.x = "Row.names", by.y = "ID")
# Keep the first row per gene symbol.
t_f1 <- distinct(t_f, `Gene symbol`, .keep_all = TRUE)
# Keep columns whose name contains "GSM" or "Gene symbol".
mer11 <- t_f1[, grep("GSM|Gene symbol", colnames(t_f1))]
# Drop the last selected column. The original `1: ncol(mer11)-1` parsed as
# `(1:n) - 1`, i.e. 0:(n-1), and only worked because index 0 is ignored;
# a negative index states the intent directly and gives the same result.
mer11 <- mer11[-ncol(mer11)]
# Drop rows that contain NA.
mer11 <- na.omit(mer11)

# Write the test-set sample annotation and expression table.
fwrite(
  t_col,
  file = "mRNA_DS_metadata_col_test_info.csv",
  sep = ",",
  row.names = TRUE
)
fwrite(mer11, file = "mRNA_DS_test_data.csv", sep = ",")
# Align the training table with the test genes: keep every test gene
# (all.y = TRUE), retain only the training columns, then drop genes that have
# no complete training row.
mer12 <- merge(
  x = mer1,
  y = mer11,
  by.x = "Gene_symbol",
  by.y = "Gene symbol",
  all.y = TRUE
)
merged_data <- mer12[, names(mer1)]
merged_data <- na.omit(merged_data)
fwrite(
  merged_data,
  file = "mRNA_DS_preprocessed_training_data.csv",
  sep = ","
)