From 489a0d5fb1c88e35ff545f79b526bf21328820ce Mon Sep 17 00:00:00 2001 From: aakan96 <aakan96@mi.fu-berlin.de> Date: Fri, 21 Jul 2023 09:21:25 +0000 Subject: [PATCH] Neue Datei hochladen --- .../TCGA_survival_analysis_described.Rmd | 179 ++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 06_Survival_Analysis/TCGA_survival_analysis_described.Rmd diff --git a/06_Survival_Analysis/TCGA_survival_analysis_described.Rmd b/06_Survival_Analysis/TCGA_survival_analysis_described.Rmd new file mode 100644 index 0000000..7ba3f02 --- /dev/null +++ b/06_Survival_Analysis/TCGA_survival_analysis_described.Rmd @@ -0,0 +1,179 @@ +--- +title: "Survival_analysis" +output: pdf_document +--- + +```{r setup, include=FALSE} +# Load required packages +#install.packages("BiocManager") +#BiocManager::install("TCGAbiolinks") +library(TCGAbiolinks) +library(survminer) +library(survival) +library(SummarizedExperiment) +library(DESeq2) +library(dplyr) +library(tidyr) +library(tibble) +# Query and retrieve clinical data for esophageal cancer (ESCC) +clinical_data_escc <- GDCquery_clinic("TCGA-ESCA") +``` +```{r} +# Check for relevant columns in clinical data +colnames_to_check <- c("vital_status", "days_to_last_follow_up", "days_to_death") +has_relevant_columns <- any(colnames(clinical_data_escc) %in% colnames_to_check) +relevant_columns_indices <- which(colnames(clinical_data_escc) %in% colnames_to_check) +relevant_columns <- clinical_data_escc[, relevant_columns_indices] + +``` + +```{r} +# Print summary of vital status +table(clinical_data_escc$vital_status) +``` +```{r} +# Create a new variable "deceased" based on vital status +clinical_data_escc$deceased <- ifelse(clinical_data_escc$vital_status == "Alive", FALSE, TRUE) +``` +```{r} +# Create an "overall_survival" variable that considers days_to_death for deceased patients and days_to_last_follow_up for alive patients +clinical_data_escc$overall_survival <- ifelse(clinical_data_escc$vital_status == "Alive", + clinical_data_escc$days_to_last_follow_up, + clinical_data_escc$days_to_death) +``` +```{r} +# Build a query to retrieve gene expression data for the entire cohort +query_escc_all <- GDCquery( + project = "TCGA-ESCA", + data.category = "Transcriptome Profiling", + experimental.strategy = "RNA-Seq", + workflow.type = "STAR - Counts", + data.type = "Gene Expression Quantification", + access = "open" +) + +output_escc <- getResults(query_escc_all) +tumor <- output_escc$cases +``` +```{r} +# Build a query to retrieve gene expression data for 20 primary tumors and solid tissue normal samples +query_escc <- GDCquery( + project = "TCGA-ESCA", + data.category = "Transcriptome Profiling", + experimental.strategy = "RNA-Seq", + workflow.type = "STAR - Counts", + data.type = "Gene Expression Quantification", + sample.type = c("Primary Tumor", "Solid Tissue Normal"), + access = "open", + barcode = tumor +) +``` +```{r} +# Download the data +GDCdownload(query_escc) +library(SummarizedExperiment) +``` + +```{r} +# Prepare the gene expression data +tcga_escc_data <- GDCprepare(query_escc, summarizedExperiment = TRUE) +escc_matrix <- assay(tcga_escc_data) +``` +```{r} +# Extract gene and sample metadata from the summarizedExperiment object +gene_metadata <- as.data.frame(rowData(tcga_escc_data)) +coldata <- as.data.frame(colData(tcga_escc_data)) +``` +```{r} +# Merge gene expression data with gene metadata using gene_id +merged_data <- merge(escc_matrix, gene_metadata, by.x = 0, by.y = "gene_id") +test_gene <-merged_data + +# Extract gene expression data for TCGA samples +sample_ids <- colnames(test_gene) +``` +```{r} +# Extract gene expression data for TCGA samples +sample_ids <- colnames(test_gene) + +# Function to assign groups based on TCGA IDs +assign_group <- function(tcga_id) { + group <- "" + parts <- unlist(strsplit(tcga_id, "-")) + + if (length(parts) >= 4) { + fourth_part <- parts[4] + if (grepl("\\d{2}", fourth_part)) { + num <- as.numeric(substr(fourth_part, 1, 2)) + if (num >= 10 & num <= 29) { + group <- "Control" + } else if (num >= 1 & num <= 9) { + group <- "Cancer" + } + } + } + + return(group) +} + +``` +```{r} +# Assign groups to TCGA IDs +group_assignments <- sapply(sample_ids, assign_group) + +# Combine group assignments with the sample data +combined_data <- as.data.frame(group_assignments) + +# VST transform counts for use in survival analysis +library(DESeq2) + +# Setting up countData object +dds <- DESeqDataSetFromMatrix(countData = escc_matrix, + colData = coldata, + design = ~ 1) +``` + +```{r} +# Removing genes with a sum total of 10 reads across all samples +keep <- rowSums(counts(dds)) >= 10 +dds <- dds[keep,] + +# VST transformation +vsd <- vst(dds, blind = FALSE) +escc_matrix_vst <- assay(vsd) + +# Get data for the RUVBL1 gene and add gene metadata information to it +gene_named <- escc_matrix %>% + as.data.frame() %>% + rownames_to_column(var = 'gene_id') %>% + gather(key = 'case_id', value = 'counts', -gene_id) %>% + left_join(., gene_metadata, by = "gene_id") %>% + filter(gene_name == "ATP6V1D") + +# Calculate the median value +median_value <- median(gene_named$counts) + +# Assign strata based on median count +gene_named$strata <- ifelse(gene_named$counts >= median_value, "HIGH", "LOW") + +# Merge clinical information with gene expression data +gene_named$case_id <- gsub('-01.*', '', gene_named$case_id) +gene_named <- merge(gene_named, clinical_data_escc, by.x = 'case_id', by.y = 'submitter_id') + +# Convert days to months for overall_survival variable +gene_named$overall_survival <- gene_named$overall_survival / 30 +``` +```{r} +# Fitting survival curve +fit <- survfit(Surv(overall_survival, deceased) ~ strata, data = gene_named) + +# Plotting survival curves +ggsurvplot(fit, + data = gene_named, + pval = TRUE, + risk.table = FALSE) +``` + + + + -- GitLab