library(readr)
library(data.table)
library(GEOquery)
library(dplyr)
library(readxl)
# require(hta20transcriptcluster.db)

# Helpers ----

# Read a tab-separated top table (as exported by the GEO2R tool).
read_top_table <- function(path) {
  read_delim(path, delim = "\t", escape_double = FALSE, trim_ws = TRUE)
}

# Keep the rows of a top table whose adjusted p-value is below `padj_cutoff`
# and whose absolute log fold change is above `logfc_cutoff`.
# `which()` drops rows where either value is NA.
filter_significant <- function(top_table,
                               padj_cutoff = 0.05,
                               logfc_cutoff = 0.263) {
  keep <- which(
    top_table$adj.P.Val < padj_cutoff &
      abs(top_table$logFC) > logfc_cutoff
  )
  top_table[keep, ]
}

# Download a GEO series and return one of its ExpressionSets.
#
# accession: GEO series accession, e.g. "GSE43732".
# platform:  optional pattern matched against the names of the downloaded
#            list when the series holds more than one element. With NULL the
#            first element is always returned.
#
# Stops with an explicit message when `platform` is given, the series holds
# several elements and none of their names matches the pattern.
load_geo_series <- function(accession, platform = NULL) {
  series <- getGEO(accession, GSEMatrix = TRUE, AnnotGPL = TRUE, getGPL = TRUE)
  if (is.null(platform) || length(series) <= 1) {
    return(series[[1]])
  }
  hits <- grep(platform, names(series))
  if (length(hits) == 0) {
    stop(
      "No element of ", accession, " matches platform '", platform,
      "'; available: ", paste(names(series), collapse = ", "),
      call. = FALSE
    )
  }
  series[[hits[1]]]
}

# Build a one-column data frame (column `title0`) holding the phenotype field
# `field` of `eset`, with `sample_ids` as row names.
make_sample_table <- function(eset, field,
                              sample_ids = colnames(exprs(eset))) {
  annotation <- pData(eset)[[field]]
  if (is.null(annotation)) {
    stop("Phenotype field '", field, "' not found.", call. = FALSE)
  }
  data.frame(title0 = annotation, row.names = sample_ids)
}

# Attach the feature annotation to the expression matrix (joined on the probe
# ID, which ends up in column `Row.names`) and keep only the probes listed in
# `hits` (joined on its `ID` column).
annotate_hits <- function(eset, hits) {
  annotated <- merge(exprs(eset), fData(eset), by.x = 0, by.y = "ID")
  merge(annotated, hits, by.x = "Row.names", by.y = "ID")
}

# Drop every column containing an NA, keep the columns whose name matches
# `keep_pattern`, then drop any remaining rows with an NA.
# `drop = FALSE` keeps a data frame even if a single column survives.
select_complete_columns <- function(df, keep_pattern) {
  no_missing <- df[, colSums(is.na(df)) == 0, drop = FALSE]
  kept <- no_missing[, grep(keep_pattern, colnames(no_missing)), drop = FALSE]
  na.omit(kept)
}

# miRNA 1: GSE43732 ----

# Read the top significant features found with the GEO2R tool and filter them.
GSE43732_top_table <- read_top_table("GSE43732.top.table.tsv")
resFilt_GSE43732 <- filter_significant(GSE43732_top_table)

# Load series and platform data from GEO.
gse43732 <- load_geo_series("GSE43732", platform = "GPL6480")

# Sample names and their group annotation taken from the metadata.
d_col <- make_sample_table(gse43732, "characteristics_ch1")

d_f <- annotate_hits(gse43732, resFilt_GSE43732)
duplicated_genes <- d_f$Row.names[duplicated(d_f$Row.names)]
d_f1 <- d_f %>% distinct(Row.names, .keep_all = TRUE)

# miRNA 2: GSE67269 (just a try, not included in the project) ----

GSE67269_top_table <- read_top_table("GSE67269.top.table.tsv")
resFilt_GSE67269 <- filter_significant(GSE67269_top_table)

# The first element of the series is always used for this data set.
gse67269 <- load_geo_series("GSE67269")

# Carry the feature row names along so the row-name merge below pairs every
# miRNA ID with its own expression row. A bare as.data.frame() of the ID
# vector would be numbered 1..n and would only line up by coincidence.
gse67269_features <- fData(gse67269)
mi_ids <- data.frame(
  miRNA_ID = gse67269_features[["miRNA_ID"]],
  row.names = rownames(gse67269_features)
)

h_exprs <- merge(mi_ids, exprs(gse67269), by = 0)
h_exprs["Row.names"] <- NULL
head(h_exprs, 20)
h_exprs$miRNA_ID <- gsub("\\*", "", h_exprs$miRNA_ID)

# Aggregate the data by miRNA_ID and take maximum values.
aggregated_data <- aggregate(. ~ miRNA_ID, h_exprs, max)

# Sample names come from the aggregated table (all columns but the ID).
h_col <- make_sample_table(
  gse67269,
  "characteristics_ch1",
  sample_ids = colnames(aggregated_data[, -1])
)

h_f <- merge(
  aggregated_data, resFilt_GSE67269,
  by.x = "miRNA_ID", by.y = "miRNA_ID"
)
# h_f has no `Row.names` column (it was removed above); the ID is `miRNA_ID`.
duplicated_genes <- h_f$miRNA_ID[duplicated(h_f$miRNA_ID)]
h_f1 <- h_f %>% distinct(miRNA_ID, .keep_all = TRUE)

# Merge all the data with the help of gene symbols.
# ab <- merge(d_f1, h_f1, by.x = "Row.names", by.y = "miRNA_ID")

# Clean and save the miRNA data (GSE43732 only) ----

# Clean any duplicate IDs.
occurrenceClean <- d_f1[!duplicated(d_f1$Row.names), ]
last_cleany <- select_complete_columns(occurrenceClean, "GSM|Row.names")

# Save the data.
fwrite(
  d_col,
  file = "miRNA_DS_metadata_col_info.csv", sep = ",", row.names = TRUE
)
fwrite(last_cleany, file = "miRNA_DS_preprocessed_data.csv", sep = ",")

# mRNA: GSE70409 ----

GSE70409_top_table <- read_top_table("GSE70409.top.table.tsv")
resFilt_GSE70409 <- filter_significant(GSE70409_top_table)
gse70409 <- load_geo_series("GSE70409", platform = "GPL6480")
a_col <- make_sample_table(gse70409, "characteristics_ch1")
a_f <- annotate_hits(gse70409, resFilt_GSE70409)
a_f1 <- a_f %>% distinct(Gene_symbol, .keep_all = TRUE)

# mRNA: GSE20347 ----

GSE20347_top_table <- read_top_table("GSE20347.top.table.tsv")
resFilt_GSE20347 <- filter_significant(GSE20347_top_table)
gse20347 <- load_geo_series("GSE20347", platform = "GPL6480")
c_col <- make_sample_table(gse20347, "characteristics_ch1")
c_f <- annotate_hits(gse20347, resFilt_GSE20347)
c_f1 <- c_f %>% distinct(`Gene symbol`, .keep_all = TRUE)

# mRNA: GSE29001 ----

GSE29001_top_table <- read_top_table("GSE29001.top.table.tsv")
resFilt_GSE29001 <- filter_significant(GSE29001_top_table)
gse29001 <- load_geo_series("GSE29001", platform = "GPL6480")
b_col <- make_sample_table(gse29001, "characteristics_ch1.1")
b_f <- annotate_hits(gse29001, resFilt_GSE29001)
b_f1 <- b_f %>% distinct(`Gene symbol`, .keep_all = TRUE)

# mRNA: GSE23400 ----

GSE23400_top_table <- read_top_table("GSE23400.top.table.tsv")
resFilt_GSE23400 <- filter_significant(GSE23400_top_table)
# The first element of the series is always used for this data set.
gse23400 <- load_geo_series("GSE23400")
g_col <- make_sample_table(gse23400, "source_name_ch1")
g_f <- annotate_hits(gse23400, resFilt_GSE23400)
g_f1 <- g_f %>% distinct(`Gene symbol`, .keep_all = TRUE)

# Combine the mRNA data sets ----

# Merge all the data with the help of gene symbols.
ab <- merge(a_f1, b_f1, by.x = "Gene_symbol", by.y = "Gene symbol")
abc <- merge(ab, c_f1, by.x = "Gene_symbol", by.y = "Gene.symbol")
abcd <- merge(abc, g_f1, by.x = "Gene_symbol", by.y = "Gene.symbol")

# Order by gene symbol, then by one sample column, both decreasing.
occurrence <- abcd[
  order(abcd$Gene_symbol, abcd$GSM1727130, decreasing = TRUE),
]

# Clean any duplicate gene symbols (the first row per symbol is kept).
occurrenceClean <- occurrence[!duplicated(occurrence$Gene_symbol), ]
last_cleany <- select_complete_columns(occurrenceClean, "GSM|Gene_symbol")

# Bind all metadata tables, then write and save them.
binded_col <- rbind(a_col, b_col, c_col, g_col)
fwrite(
  binded_col,
  file = "mRNA_DS_metadata_col_info.csv", sep = ",", row.names = TRUE
)

# Merge with the results of multimir (just a try).
multi <- read_excel("multimir_results_final.xlsx")
mer <- merge(last_cleany, multi, by.x = "Gene_symbol", by.y = "target_symbol")
mer1 <- mer %>% distinct(Gene_symbol, .keep_all = TRUE)
mer1 <- mer1[, grep("GSM|Gene_symbol", colnames(mer1)), drop = FALSE]

# Test data: GSE38129 ----

GSE38129_top_table <- read_top_table("GSE38129.top.table.tsv")
resFilt_GSE38129 <- filter_significant(GSE38129_top_table)
# The first element of the series is always used for this data set.
gse38129 <- load_geo_series("GSE38129")
t_col <- make_sample_table(gse38129, "source_name_ch1")
t_f <- annotate_hits(gse38129, resFilt_GSE38129)
t_f1 <- t_f %>% distinct(`Gene symbol`, .keep_all = TRUE)

mer11 <- t_f1[, grep("GSM|Gene symbol", colnames(t_f1)), drop = FALSE]
# Drop the last selected column.
# NOTE(review): confirm which column this removes; the merge further down
# still needs a `Gene symbol` column in mer11.
mer11 <- mer11[-ncol(mer11)]
mer11 <- na.omit(mer11)

# Save them all.
fwrite(
  t_col,
  file = "mRNA_DS_metadata_col_test_info.csv", sep = ",", row.names = TRUE
)
fwrite(mer11, file = "mRNA_DS_test_data.csv", sep = ",")

# Training data: rows follow the test data (all.y = TRUE), columns follow
# mer1; rows left with an NA by the join are removed.
mer12 <- merge(
  x = mer1, y = mer11,
  by.x = "Gene_symbol", by.y = "Gene symbol", all.y = TRUE
)
merged_data <- mer12[, names(mer1)]
merged_data <- na.omit(merged_data)
fwrite(
  merged_data,
  file = "mRNA_DS_preprocessed_training_data.csv", sep = ","
)