From a42c7e9bbb53497dd37940a4639a43f8a59cd1b5 Mon Sep 17 00:00:00 2001
From: aakan96 <aakan96@mi.fu-berlin.de>
Date: Fri, 21 Jul 2023 10:59:37 +0000
Subject: [PATCH] Neue Datei hochladen

---
 .../Deseq2/Deseq2_GSE9001.Rmd                 | 110 ++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100644 01_Differential_Expression_Analysis/Deseq2/Deseq2_GSE9001.Rmd

diff --git a/01_Differential_Expression_Analysis/Deseq2/Deseq2_GSE9001.Rmd b/01_Differential_Expression_Analysis/Deseq2/Deseq2_GSE9001.Rmd
new file mode 100644
index 0000000..b915614
--- /dev/null
+++ b/01_Differential_Expression_Analysis/Deseq2/Deseq2_GSE9001.Rmd
@@ -0,0 +1,110 @@
+---
+title: "Deseq2_GSE29001"
+output: pdf_document
+---
+
+```{r setup, include=FALSE}
+library(GEOquery)
+library(dplyr)
+library(magrittr)
+library(DESeq2)
+
+# Fetch GSE29001; in a multi-platform series keep the GPL4508 ExpressionSet.
+mRNA_GSE29001 <- getGEO("GSE29001", GSEMatrix = TRUE, AnnotGPL = TRUE, getGPL = TRUE)
+idx <- if (length(mRNA_GSE29001) > 1) grep("GPL4508", names(mRNA_GSE29001))[1] else 1
+if (is.na(idx)) stop("Platform GPL4508 not found in GSE29001", call. = FALSE)
+mRNA_GSE29001 <- mRNA_GSE29001[[idx]]
+colx <- mRNA_GSE29001@featureData@data[["Gene symbol"]]
+GSE29001 <- cbind(colx, mRNA_GSE29001@assayData[["exprs"]])  # character matrix: symbol + samples
+```
+
+
+```{r}
+# Keep a data-frame copy of the annotated matrix (gene-symbol column + one column per sample)
+df_GSE29001 <- as.data.frame(GSE29001)
+
+# Extract only the expression values: drop the gene-symbol column and convert the
+# character matrix back to numeric in place, which keeps row and column names intact.
+count_matrix <- GSE29001[, -1, drop = FALSE]
+storage.mode(count_matrix) <- "double"
+row_names_GSE29001 <- rownames(count_matrix)
+
+# Sample information (one label per column of count_matrix, in column order)
+sample_names <- colnames(count_matrix)
+conditions <- c("normal", "normal", "tumor", "normal", "normal", "tumor", "tumor", "normal", "normal", "tumor", "tumor",
+                "normal", "normal", "tumor", "tumor", "normal", "normal", "tumor", "tumor", "normal", "normal",  "tumor",
+                "normal", "normal", "tumor", "tumor", "normal", "normal", "tumor", "tumor", "normal", "normal", "tumor",
+                "tumor", "normal", "normal", "tumor", "tumor", "normal", "normal", "tumor", "tumor", "normal", "normal", "tumor")
+stopifnot(length(conditions) == length(sample_names))
+sample_info <- data.frame(Sample = sample_names,
+                          condition = factor(conditions, levels = c("normal", "tumor")),
+                          row.names = sample_names)
+
+#### SKIRMISH FOR DESEQ2 DATASET ###########
+# NOTE(review): DESeq2 expects raw integer counts; confirm these values are suitable.
+# Drop rows with missing values, then round. Subsetting the matrix keeps the row names
+# aligned with the remaining rows (re-attaching the unfiltered names fails if a row is dropped).
+count_matrix <- count_matrix[complete.cases(count_matrix), , drop = FALSE]
+count_matrix <- round(count_matrix)
+
+# Create the DESeqDataSet using the updated sample_info
+dds <- DESeqDataSetFromMatrix(countData = count_matrix,
+                              colData = sample_info,
+                              design = ~ condition)
+
+# Perform differential gene expression analysis
+dds <- DESeq(dds)
+
+# List the fitted coefficient names (the next chunk uses "condition_tumor_vs_normal")
+resultsNames(dds)
+
+```
+```{r}
+# Per-sample boxplots of log10 Cook's distances; range = 0 extends whiskers to the extremes.
+par(mar = c(8, 5, 2, 2))
+boxplot(log10(assay(dds, "cooks")), las = 2, range = 0)
+res <- results(dds, name = "condition_tumor_vs_normal")  # tumor vs. normal coefficient
+summary(res)
+
+```
+Box-and-whisker plot: depicts the distribution of the log10 Cook's distances of all genes for each sample (one box per sample, covering the normal and tumor samples). The boxes show the interquartile range (IQR), while the centre line inside each box marks the median. Because the plot is drawn with `range = 0`, the whiskers extend to the most extreme values, so no separate outlier points are drawn. The figure allows us to compare the samples and to spot any sample whose Cook's distances are consistently higher than the others, which would mark it as a potential outlier sample.
+```{r}
+# Genes significantly upregulated in tumor vs. normal (log2 fold change > 1, padj < 0.05).
+# which() skips rows where the comparison is NA, so no separate is.na() test is needed.
+filtered_indices <- which(res$log2FoldChange > 1 & res$padj < 0.05)
+filtered_upregulated_genes <- res[filtered_indices, ]
+gene_names <- rownames(filtered_upregulated_genes)
+head(gene_names, 10)
+
+```
+```{r}
+# rlog-transform the data (blind = TRUE), then draw a PCA plot grouped by condition.
+rld <- rlog(dds, blind = TRUE)
+PCA1 <- plotPCA(rld, intgroup = "condition")
+PCA1
+```
+Each point on the PCA plot represents a sample, and the color of each point indicates the experimental group (normal or tumor) to which the sample belongs.
+
+```{r}
+library(DESeq2)
+library(PCAtools)
+vst_mat <- assay(vst(dds))  # stored as vst_mat so the vst() function name is not masked
+p <- pca(vst_mat, removeVar = 0.1)  # removeVar = 0.1: drop the 10% lowest-variance features
+screeplot(p, axisLabSize = 18, titleLabSize = 22)
+
+```
+The scree plot depicts the proportion of variation explained by each principal component, allowing us to assess the importance of each PC in capturing the overall variability in the dataset. This figure is useful for evaluating the dimensionality of the data and estimating how many principal components are required to represent a substantial percentage of the dataset's variability.
+
+```{r}
+# Sample scores with the variable loadings overlaid (showLoadings = TRUE).
+biplot(p, showLoadings = TRUE, labSize = 5,
+       pointSize = 5, sizeLoadingsNames = 5)
+
+```
+A biplot is a type of scatter plot that combines the sample scores (positions of the samples in the PCA space) and the variable loadings (contributions of the variables to each principal component) on the same plot. The distance between the sample points on the biplot illustrates how similar or distinct the gene expression patterns of the samples are.
+
+
+
+
+
+
-- 
GitLab