Update 2 files

- /01_Differential_Expression_Analysis/merging/merge_rm.pdf - /01_Differential_Expression_Analysis/merging/merge_main.R

Update 2 files
9648e6e0 · aakan96 · b1f26f55 · 9648e6e0 · 9648e6e0
Commit 9648e6e0 authored 1 year ago by aakan96
--- a/01_Differential_Expression_Analysis/merging/merge_main.R
+++ b/01_Differential_Expression_Analysis/merging/merge_main.R
-#data with ".top.table.tsv" are found under folder data_for_merging.
 library(readr)
 library(data.table)
 library(GEOquery)
@@ -29,13 +28,19 @@ clname<- colnames(t0)
 d_col<- as.data.frame(title0,clname)
 # get metadata out of S4 object
 gene_data_frame = fData(gset1)
 d<-merge(t0,gene_data_frame, by.x=0, by.y= "ID")
 d_f<- merge(d,resFilt_GSE43732, by.x="Row.names", by.y= "ID")
 duplicated_genes <- d_f$Row.names[duplicated(d_f$Row.names)]
 d_f1<- d_f %>% distinct(`Row.names`, .keep_all = T)
-###############miRNA2####this was just a try(not included in the project)######################################
+###############miRNA2##########################################
 #read the top significant genes found by using GEO2R tool:
 GSE67269_top_table <- read_delim("GSE67269.top.table.tsv", 
                                 delim = "\t", escape_double = FALSE, 
@@ -68,14 +73,19 @@ clname<- colnames(aggregated_data[,-1])
 h_col<- as.data.frame(title0,clname)
 # get metadata out of S4 object
 gene_data_frame = fData(gset1)
 h<-aggregated_data
 h_f<- merge(h,resFilt_GSE67269, by.x="miRNA_ID", by.y= "miRNA_ID")
 duplicated_genes <- h_f$Row.names[duplicated(h_f$Row.names)]
 h_f1<- h_f %>% distinct(`miRNA_ID`, .keep_all = T)
 #merge all the data with the help of genesymbols.
 #ab<- merge(d_f1,h_f1,by.x="Row.names",by.y="miRNA_ID")
 ####################################################
 #clean any duplicate gene symbols:
 occurrenceClean <- d_f1[!duplicated(d_f1$Row.names),]
 cleany<- occurrenceClean[ , colSums(is.na(occurrenceClean))==0]# remove columns with NA
@@ -83,7 +93,7 @@ cleany<- occurrenceClean[ , colSums(is.na(occurrenceClean))==0]# remove columns
 cleanyed<- cleany[, grep("GSM|Row.names", colnames(cleany))]
 #again remove more columns
 last_cleany<- na.omit(cleanyed)# remove any rows with NA.
-#save the data
 fwrite(d_col, file = "miRNA_DS_metadata_col_info.csv", sep = ",",row.names = TRUE)
 fwrite(last_cleany, file = "miRNA_DS_preprocessed_data.csv",sep= ",")
@@ -110,16 +120,17 @@ title0<- gset1@phenoData@data[["characteristics_ch1"]]
 clname<- colnames(t0)
 # make dataframe of the sample names and their corresponding information about cancer/non cancer/adema
 a_col<- as.data.frame(title0,clname)
-#pull out the gene information from metadata of GEO
 gene_data_frame = fData(gset1)
-#make the data ready:
 a<-merge(t0,gene_data_frame, by.x=0, by.y= "ID")
 a_f<- merge(a,resFilt_GSE70409, by.x="Row.names", by.y= "ID")
 a_f1<- a_f %>% distinct(`Gene_symbol`, .keep_all = T)
-####################do the same for another data##########################################################
+##############################################################################
 GSE20347_top_table <- read_delim("GSE20347.top.table.tsv", 
                                 delim = "\t", escape_double = FALSE, 
                                 trim_ws = TRUE)
@@ -145,6 +156,7 @@ c_col<- as.data.frame(title0,clname)
 gene_data_frame = fData(gset1)
 cc<-merge(t0,gene_data_frame, by.x=0, by.y= "ID")
 c_f<- merge(cc,resFilt_GSE20347, by.x="Row.names", by.y= "ID")
 c_f1<- c_f %>% distinct(`Gene symbol`, .keep_all = T)
@@ -220,7 +232,6 @@ binded_col<- rbind(a_col,b_col,c_col,g_col)
 #write it and save:
 fwrite(binded_col, file = "mRNA_DS_metadata_col_info.csv", sep = ",",row.names = TRUE)
 library("readxl")
-#merge with the results of multimir(just a try)
 multi<- read_excel("multimir_results_final.xlsx")
 mer<- merge(last_cleany,multi,by.x="Gene_symbol",by.y="target_symbol")
 mer1<- mer %>% distinct(`Gene_symbol`, .keep_all = T)
@@ -255,7 +266,6 @@ mer11<- t_f1[, grep("GSM|Gene symbol", colnames(t_f1))]
 mer11 <- mer11 [1: ncol(mer11)-1 ]
 mer11<-na.omit(mer11)
-#Save them all:
 fwrite(t_col, file = "mRNA_DS_metadata_col_test_info.csv", sep = ",",row.names = TRUE)
 fwrite(mer11, file = "mRNA_DS_test_data.csv",sep= ",")
@@ -263,3 +273,161 @@ mer12<- merge(x =mer1 , y = mer11, by.x = "Gene_symbol", by.y="Gene symbol", all
 merged_data <- mer12[, names(mer1)]
 merged_data<-na.omit(merged_data)
 fwrite(merged_data, file = "mRNA_DS_preprocessed_training_data.csv",sep= ",")
+# see distribution of one of deuplicated genes.
+########################################################
+# Install required packages
+#Tumor types range from 01 - 09, normal types from 10 - 19 and control samples from 20 - 29. 
+#BiocManager::install("TCGAbiolinks")
+library(TCGAbiolinks)
+library(survminer)
+library(survival)
+# getting clinical data for TCGA-BRCA cohort -------------------
+clinical_escc <- GDCquery_clinic("TCGA-ESCA")
+any(colnames(clinical_escc) %in% c("vital_status", "days_to_last_follow_up", "days_to_death"))
+which(colnames(clinical_escc) %in% c("vital_status", "days_to_last_follow_up", "days_to_death"))
+clinical_escc[,c(8,42,47)]
+# looking at some variables associated with survival 
+table(clinical_escc$vital_status)
+# change certain values the way they are encoded
+clinical_escc$deceased <- ifelse(clinical_escc$vital_status == "Alive", FALSE, TRUE)
+# create an "overall survival" variable that is equal to days_to_death
+# for dead patients, and to days_to_last_follow_up for patients who
+# are still alive
+clinical_escc$overall_survival <- ifelse(clinical_escc$vital_status == "Alive",
+                                         clinical_escc$days_to_last_follow_up,
+                                         clinical_escc$days_to_death)
+# build a query to get gene expression data for entire cohort
+query_escc_all = GDCquery(
+  project = "TCGA-ESCA",
+  data.category = "Transcriptome Profiling", # parameter enforced by GDCquery
+  experimental.strategy = "RNA-Seq",
+  workflow.type = "STAR - Counts",
+  data.type = "Gene Expression Quantification",
+  access = "open")
+output_escc <- getResults(query_escc_all)
+# get 20 primary tissue sample barcodes
+tumor <- output_escc$cases#[1:50]
+# # get gene expression data from 20 primary tumors 
+query_escc <- GDCquery(
+  project = "TCGA-ESCA",
+  data.category = "Transcriptome Profiling", # parameter enforced by GDCquery
+  experimental.strategy = "RNA-Seq",
+  workflow.type = "STAR - Counts",
+  data.type = "Gene Expression Quantification",
+  sample.type = c("Primary Tumor", "Solid Tissue Normal"),
+  access = "open",
+  barcode = tumor)
+# download data
+GDCdownload(query_escc)
+library(SummarizedExperiment)
+# get counts
+tcga_escc_data <- GDCprepare(query_escc, summarizedExperiment = T)
+escc_matrix <- assay(tcga_escc_data)
+escc_matrix[1:10,1:10]
+# extract gene and sample metadata from summarizedExperiment object
+gene_metadata <- as.data.frame(rowData(tcga_escc_data))
+coldata <- as.data.frame(colData(tcga_escc_data))
+#####################################################
+lis<- escc_matrix
+comm<- merge(lis,gene_metadata,by.x=0,by.y="gene_id")
+comm<- comm[, grep("TCGA|gene_name", colnames(comm))]
+test_tcga<- merge(comm, mer1,by.x="gene_name",by.y="Gene_symbol",all.y= T)
+test_tcga<- test_tcga[, grep("TCGA|gene_name", colnames(test_tcga))]
+sample_ids<- colnames(test_tcga)
+# Function to assign groups based on TCGA IDs
+# Function to assign groups
+assign_group <- function(tcga_id) {
+  group <- ""
+  parts <- unlist(strsplit(tcga_id, "-"))
+  if (length(parts) >= 4) {
+    fourth_part <- parts[4]
+    if (grepl("\\d{2}", fourth_part)) {
+      num <- as.numeric(substr(fourth_part, 1, 2))
+      if (num >= 10 & num <= 29) {
+        group <- "Control"
+      } else if (num >= 1 & num <= 9) {
+        group <- "Cancer"
+      }
+    }
+  }
+  return(group)
+}
+# Assign groups to TCGA IDs
+group_assignments <- sapply(sample_ids, assign_group)
+comb<- as.data.frame(group_assignments)
+#comb<- cbind(c_data,sample_labels)
+#fwrite(test_tcga, file = "mRNA_TCGA_DS_test_data.csv",sep= ",")
+#fwrite(comb, file = "mRNA_TCGA_DS_col_data.csv",sep= ",",row.names = T)
+##############################################
+# vst transform counts to be used in survival analysis ---------------
+library(DESeq2)
+# Setting up countData object   
+dds <- DESeqDataSetFromMatrix(countData = escc_matrix,
+                              colData = coldata,
+                              design = ~ 1)
+# Removing genes with sum total of 10 reads across all samples
+keep <- rowSums(counts(dds)) >= 10
+dds <- dds[keep,]
+######################################################################
+#
+################################################################
+# vst 
+vsd <- vst(dds, blind=FALSE)
+escc_matrix_vst <- assay(vsd)
+escc_matrix_vst[1:10,1:10]
+library(tidyr)
+library(dplyr)
+library(tibble)
+# Get data for TP53 gene and add gene metadata information to it -------------
+escc_gene <- escc_matrix_vst %>% 
+  as.data.frame() %>% 
+  rownames_to_column(var = 'gene_id') %>% 
+  gather(key = 'case_id', value = 'counts', -gene_id) %>% 
+  left_join(., gene_metadata, by = "gene_id") %>% 
+  filter(gene_name == "RUVBL1")
+# get median value
+median_value <- median(escc_gene$counts)
+# denote which cases have higher   lower expression than median count
+escc_gene$strata <- ifelse(escc_gene$counts >= median_value, "HIGH", "LOW")
+# Add clinical information to escc_gene
+escc_gene$case_id <- gsub('-01.*', '', escc_gene$case_id)
+escc_gene <- merge(escc_gene, clinical_escc, by.x = 'case_id', by.y = 'submitter_id')
+# Convert days to months for overall_survival variable
+escc_gene$overall_survival <- escc_gene$overall_survival / 30
+# fitting survival curve -----------
+fit <- survfit(Surv(overall_survival, deceased) ~ strata, data = escc_gene)
+fit
+ggsurvplot(fit,
+           data = escc_gene,
+           surv.median.line = "hv",
+           test.for.trend = FALSE,
+           risk.table = T)
+fit2 <- survdiff(Surv(overall_survival, deceased) ~ strata, data = escc_gene)
\ No newline at end of file
--- a/01_Differential_Expression_Analysis/merging/merge_rm.pdf
+++ b/01_Differential_Expression_Analysis/merging/merge_rm.pdf