diff --git a/01_Differential_Expression_Analysis/merging/merge_main.R b/01_Differential_Expression_Analysis/merging/merge_main.R index d547dab9ada6aa27c8a1e73edd7c52cd78e59f62..3743adbd40c3ac0b7e18f760474cd810e8776a37 100644 --- a/01_Differential_Expression_Analysis/merging/merge_main.R +++ b/01_Differential_Expression_Analysis/merging/merge_main.R @@ -1,4 +1,3 @@ -#data with ".top.table.tsv" are found under folder data_for_merging. library(readr) library(data.table) library(GEOquery) @@ -29,13 +28,19 @@ clname<- colnames(t0) d_col<- as.data.frame(title0,clname) # get metadata out of S4 object gene_data_frame = fData(gset1) + d<-merge(t0,gene_data_frame, by.x=0, by.y= "ID") + d_f<- merge(d,resFilt_GSE43732, by.x="Row.names", by.y= "ID") + duplicated_genes <- d_f$Row.names[duplicated(d_f$Row.names)] + + d_f1<- d_f %>% distinct(`Row.names`, .keep_all = T) -###############miRNA2####this was just a try(not included in the project)###################################### + +###############miRNA2########################################## #read the top significant genes found by using GEO2R tool: GSE67269_top_table <- read_delim("GSE67269.top.table.tsv", delim = "\t", escape_double = FALSE, @@ -68,14 +73,19 @@ clname<- colnames(aggregated_data[,-1]) h_col<- as.data.frame(title0,clname) # get metadata out of S4 object gene_data_frame = fData(gset1) + h<-aggregated_data h_f<- merge(h,resFilt_GSE67269, by.x="miRNA_ID", by.y= "miRNA_ID") + duplicated_genes <- h_f$Row.names[duplicated(h_f$Row.names)] + h_f1<- h_f %>% distinct(`miRNA_ID`, .keep_all = T) + #merge all the data with the help of genesymbols. #ab<- merge(d_f1,h_f1,by.x="Row.names",by.y="miRNA_ID") #################################################### + #clean any duplicate gene symbols: occurrenceClean <- d_f1[!duplicated(d_f1$Row.names),] cleany<- occurrenceClean[ , colSums(is.na(occurrenceClean))==0]# remove columns with NA @@ -83,7 +93,7 @@ cleany<- occurrenceClean[ , colSums(is.na(occurrenceClean))==0]# remove columns cleanyed<- cleany[, grep("GSM|Row.names", colnames(cleany))] #again remove more columns last_cleany<- na.omit(cleanyed)# remove any rows with NA. -#save the data + fwrite(d_col, file = "miRNA_DS_metadata_col_info.csv", sep = ",",row.names = TRUE) fwrite(last_cleany, file = "miRNA_DS_preprocessed_data.csv",sep= ",") @@ -110,16 +120,17 @@ title0<- gset1@phenoData@data[["characteristics_ch1"]] clname<- colnames(t0) # make dataframe of the sample names and their corresponding information about cancer/non cancer/adema a_col<- as.data.frame(title0,clname) -#pull out the gene information from metadata of GEO + gene_data_frame = fData(gset1) -#make the data ready: + a<-merge(t0,gene_data_frame, by.x=0, by.y= "ID") + a_f<- merge(a,resFilt_GSE70409, by.x="Row.names", by.y= "ID") a_f1<- a_f %>% distinct(`Gene_symbol`, .keep_all = T) -####################do the same for another data########################################################## +############################################################################## GSE20347_top_table <- read_delim("GSE20347.top.table.tsv", delim = "\t", escape_double = FALSE, trim_ws = TRUE) @@ -145,6 +156,7 @@ c_col<- as.data.frame(title0,clname) gene_data_frame = fData(gset1) cc<-merge(t0,gene_data_frame, by.x=0, by.y= "ID") + c_f<- merge(cc,resFilt_GSE20347, by.x="Row.names", by.y= "ID") c_f1<- c_f %>% distinct(`Gene symbol`, .keep_all = T) @@ -220,7 +232,6 @@ binded_col<- rbind(a_col,b_col,c_col,g_col) #write it and save: fwrite(binded_col, file = "mRNA_DS_metadata_col_info.csv", sep = ",",row.names = TRUE) library("readxl") -#merge with the results of multimir(just a try) multi<- read_excel("multimir_results_final.xlsx") mer<- merge(last_cleany,multi,by.x="Gene_symbol",by.y="target_symbol") mer1<- mer %>% distinct(`Gene_symbol`, .keep_all = T) @@ -255,7 +266,6 @@ mer11<- t_f1[, grep("GSM|Gene symbol", colnames(t_f1))] mer11 <- mer11 [1: ncol(mer11)-1 ] mer11<-na.omit(mer11) -#Save them all: fwrite(t_col, file = "mRNA_DS_metadata_col_test_info.csv", sep = ",",row.names = TRUE) fwrite(mer11, file = "mRNA_DS_test_data.csv",sep= ",") @@ -263,3 +273,161 @@ mer12<- merge(x =mer1 , y = mer11, by.x = "Gene_symbol", by.y="Gene symbol", all merged_data <- mer12[, names(mer1)] merged_data<-na.omit(merged_data) fwrite(merged_data, file = "mRNA_DS_preprocessed_training_data.csv",sep= ",") +# see distribution of one of deuplicated genes. +######################################################## +# Install required packages +#Tumor types range from 01 - 09, normal types from 10 - 19 and control samples from 20 - 29. +#BiocManager::install("TCGAbiolinks") +library(TCGAbiolinks) +library(survminer) +library(survival) + +# getting clinical data for TCGA-BRCA cohort ------------------- +clinical_escc <- GDCquery_clinic("TCGA-ESCA") +any(colnames(clinical_escc) %in% c("vital_status", "days_to_last_follow_up", "days_to_death")) +which(colnames(clinical_escc) %in% c("vital_status", "days_to_last_follow_up", "days_to_death")) +clinical_escc[,c(8,42,47)] +# looking at some variables associated with survival +table(clinical_escc$vital_status) + +# change certain values the way they are encoded +clinical_escc$deceased <- ifelse(clinical_escc$vital_status == "Alive", FALSE, TRUE) + +# create an "overall survival" variable that is equal to days_to_death +# for dead patients, and to days_to_last_follow_up for patients who +# are still alive +clinical_escc$overall_survival <- ifelse(clinical_escc$vital_status == "Alive", + clinical_escc$days_to_last_follow_up, + clinical_escc$days_to_death) + +# build a query to get gene expression data for entire cohort +query_escc_all = GDCquery( + project = "TCGA-ESCA", + data.category = "Transcriptome Profiling", # parameter enforced by GDCquery + experimental.strategy = "RNA-Seq", + workflow.type = "STAR - Counts", + data.type = "Gene Expression Quantification", + access = "open") + +output_escc <- getResults(query_escc_all) +# get 20 primary tissue sample barcodes +tumor <- output_escc$cases#[1:50] + +# # get gene expression data from 20 primary tumors +query_escc <- GDCquery( + project = "TCGA-ESCA", + data.category = "Transcriptome Profiling", # parameter enforced by GDCquery + experimental.strategy = "RNA-Seq", + workflow.type = "STAR - Counts", + data.type = "Gene Expression Quantification", + sample.type = c("Primary Tumor", "Solid Tissue Normal"), + access = "open", + barcode = tumor) + +# download data +GDCdownload(query_escc) +library(SummarizedExperiment) +# get counts +tcga_escc_data <- GDCprepare(query_escc, summarizedExperiment = T) +escc_matrix <- assay(tcga_escc_data) +escc_matrix[1:10,1:10] + +# extract gene and sample metadata from summarizedExperiment object +gene_metadata <- as.data.frame(rowData(tcga_escc_data)) +coldata <- as.data.frame(colData(tcga_escc_data)) + + +##################################################### +lis<- escc_matrix +comm<- merge(lis,gene_metadata,by.x=0,by.y="gene_id") +comm<- comm[, grep("TCGA|gene_name", colnames(comm))] +test_tcga<- merge(comm, mer1,by.x="gene_name",by.y="Gene_symbol",all.y= T) +test_tcga<- test_tcga[, grep("TCGA|gene_name", colnames(test_tcga))] +sample_ids<- colnames(test_tcga) + +# Function to assign groups based on TCGA IDs +# Function to assign groups +assign_group <- function(tcga_id) { + group <- "" + parts <- unlist(strsplit(tcga_id, "-")) + + if (length(parts) >= 4) { + fourth_part <- parts[4] + if (grepl("\\d{2}", fourth_part)) { + num <- as.numeric(substr(fourth_part, 1, 2)) + if (num >= 10 & num <= 29) { + group <- "Control" + } else if (num >= 1 & num <= 9) { + group <- "Cancer" + } + } + } + + return(group) +} + +# Assign groups to TCGA IDs +group_assignments <- sapply(sample_ids, assign_group) + +comb<- as.data.frame(group_assignments) +#comb<- cbind(c_data,sample_labels) + +#fwrite(test_tcga, file = "mRNA_TCGA_DS_test_data.csv",sep= ",") +#fwrite(comb, file = "mRNA_TCGA_DS_col_data.csv",sep= ",",row.names = T) + + +############################################## + + +# vst transform counts to be used in survival analysis --------------- +library(DESeq2) +# Setting up countData object +dds <- DESeqDataSetFromMatrix(countData = escc_matrix, + colData = coldata, + design = ~ 1) + +# Removing genes with sum total of 10 reads across all samples +keep <- rowSums(counts(dds)) >= 10 +dds <- dds[keep,] +###################################################################### +# + +################################################################ + +# vst +vsd <- vst(dds, blind=FALSE) +escc_matrix_vst <- assay(vsd) +escc_matrix_vst[1:10,1:10] +library(tidyr) +library(dplyr) +library(tibble) +# Get data for TP53 gene and add gene metadata information to it ------------- +escc_gene <- escc_matrix_vst %>% + as.data.frame() %>% + rownames_to_column(var = 'gene_id') %>% + gather(key = 'case_id', value = 'counts', -gene_id) %>% + left_join(., gene_metadata, by = "gene_id") %>% + filter(gene_name == "RUVBL1") +# get median value +median_value <- median(escc_gene$counts) + +# denote which cases have higher lower expression than median count +escc_gene$strata <- ifelse(escc_gene$counts >= median_value, "HIGH", "LOW") + +# Add clinical information to escc_gene +escc_gene$case_id <- gsub('-01.*', '', escc_gene$case_id) +escc_gene <- merge(escc_gene, clinical_escc, by.x = 'case_id', by.y = 'submitter_id') +# Convert days to months for overall_survival variable +escc_gene$overall_survival <- escc_gene$overall_survival / 30 + +# fitting survival curve ----------- +fit <- survfit(Surv(overall_survival, deceased) ~ strata, data = escc_gene) +fit +ggsurvplot(fit, + data = escc_gene, + surv.median.line = "hv", + test.for.trend = FALSE, + risk.table = T) + + +fit2 <- survdiff(Surv(overall_survival, deceased) ~ strata, data = escc_gene) \ No newline at end of file diff --git a/01_Differential_Expression_Analysis/merging/merge_rm.pdf b/01_Differential_Expression_Analysis/merging/merge_rm.pdf new file mode 100644 index 0000000000000000000000000000000000000000..adbaed4e1fb19ca90e42b488ba5626aeab03c2e3 Binary files /dev/null and b/01_Differential_Expression_Analysis/merging/merge_rm.pdf differ