diff --git a/DGE/Deseq2/mRNA/.Rhistory b/DGE/Deseq2/mRNA/.Rhistory
new file mode 100644
index 0000000000000000000000000000000000000000..768671fc8456938ce255bcc0917fc8a238462f73
--- /dev/null
+++ b/DGE/Deseq2/mRNA/.Rhistory
@@ -0,0 +1,426 @@
+library(GEOquery)
+library(tidyverse)
+library(magrittr)
+mRNA_GSE20347 <- getGEO("GSE20347", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T)
+if (length(mRNA_GSE20347) > 1) idx <- grep("GPL4508", attr(mRNA_GSE20347, "names")) else idx <- 1
+mRNA_GSE20347 <- mRNA_GSE20347[[idx]]
+GSE20347<- mRNA_GSE20347@assayData[["exprs"]]
+colx<- mRNA_GSE20347@featureData@data[["Gene symbol"]]
+GSE20347<- cbind(colx,GSE20347)
+# Convert matrix to a dataframe
+df_GSE20347 <- as.data.frame(GSE20347)
+library(DESeq2)
+# Extract only the count matrix
+count_matrix <- df_GSE20347[, 2:ncol(df_GSE20347)]
+# Sample information
+sample_names <- colnames(count_matrix)
+# Create the sample_info data frame
+conditions <- c(rep("normal adjacent esophageal tissue", 17), rep("esophageal squamous cell carcinoma", length(sample_names) - 17))
+sample_info <- data.frame(Sample = sample_names, Condition = conditions)
+#### SKIRMISH FOR DESEQ2 DATASET ###########
+count_matrix <- as.matrix(sapply(count_matrix, as.numeric))
+count_matrix <- count_matrix[complete.cases(count_matrix), ]
+count_matrix <- round(count_matrix)
+#get rownames of previous data
+row_names_GSE20347 <- rownames(GSE20347)
+# Set gene_id to rownames again
+rownames(count_matrix) <- row_names_GSE20347
+# Create the DESeqDataSet using the updated sample_info
+dds <- DESeqDataSetFromMatrix(countData = count_matrix,
+colData = sample_info,
+design = ~ Condition)
+# Perform differential gene expression analysis
+dds <- DESeq(dds)
+resultsNames(dds)
+par(mar=c(8,5,2,2))
+boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)
+res <- results(dds, name="Condition_normal.adjacent.esophageal.tissue_vs_esophageal.squamous.cell.carcinoma")
+summary(res)
+# Adjustment to overcome NA-error --> exclude rows containing NA's
+filtered_indices <- which(res$log2FoldChange < 1 & res$log2FoldChange > -1 & res$padj < 0.05 & !is.na(res$padj))
+filtered_downregulated_genes <- res[filtered_indices, ]
+gene_names <- rownames(filtered_downregulated_genes)
+gene_names
+res2 <- results(dds, name = "Condition_normal.adjacent.esophageal.tissue_vs_esophageal.squamous.cell.carcinoma")
+# Convert p-value to -log10(p-value)
+res2$log10_pvalue <- -log10(res2$pvalue)
+# Create the volcano plot
+volcano_plot <- ggplot(data = res2, aes(x = log2FoldChange, y = log10_pvalue)) +
+geom_point() +
+labs(x = "log2 Fold Change", y = "-log10 p-value") +
+theme_minimal()
+# Display the volcano plot
+print(volcano_plot)
+res2 <- results(dds, name = "Condition_normal.adjacent.esophageal.tissue_vs_esophageal.squamous.cell.carcinoma")
+# Convert p-value to -log10(p-value)
+res2$log10_pvalue <- -log10(res2$pvalue)
+# Create the volcano plot
+volcano_plot <- ggplot(data = res2, aes(x = log2FoldChange, y = log10_pvalue)) +
+geom_point() +
+labs(x = "log2 Fold Change", y = "-log10 p-value") +
+theme_minimal()
+library(GEOquery)
+library(tidyverse)
+library(magrittr)
+library(DESeq2)
+library(ggrepel)
+mRNA_GSE20347 <- getGEO("GSE20347", GSEMatrix = TRUE, AnnotGPL = TRUE, getGPL = TRUE)
+if (length(mRNA_GSE20347) > 1) idx <- grep("GPL4508", attr(mRNA_GSE20347, "names")) else idx <- 1
+mRNA_GSE20347 <- mRNA_GSE20347[[idx]]
+GSE20347 <- mRNA_GSE20347@assayData[["exprs"]]
+colx <- mRNA_GSE20347@featureData@data[["Gene symbol"]]
+GSE20347 <- cbind(colx, GSE20347)
+# Convert matrix to a dataframe
+df_GSE20347 <- as.data.frame(GSE20347)
+# Extract only the count matrix
+count_matrix <- df_GSE20347[, 2:ncol(df_GSE20347)]
+# Sample information
+sample_names <- colnames(count_matrix)
+# Create the sample_info data frame
+conditions <- c(rep("normal adjacent esophageal tissue", 17), rep("esophageal squamous cell carcinoma", length(sample_names) - 17))
+sample_info <- data.frame(Sample = sample_names, Condition = conditions)
+#### SKIRMISH FOR DESEQ2 DATASET ###########
+count_matrix <- as.matrix(sapply(count_matrix, as.numeric))
+count_matrix <- count_matrix[complete.cases(count_matrix), ]
+count_matrix <- round(count_matrix)
+# Get rownames of previous data
+row_names_GSE20347 <- rownames(GSE20347)
+# Set gene_id to rownames again
+rownames(count_matrix) <- row_names_GSE20347
+# Create the DESeqDataSet using the updated sample_info
+dds <- DESeqDataSetFromMatrix(countData = count_matrix,
+colData = sample_info,
+design = ~ Condition)
+# Perform differential gene expression analysis
+dds <- DESeq(dds)
+# Volcano plot
+res <- results(dds)
+res$logPadj <- -log10(res$padj)
+res$logFC <- res$log2FoldChange
+res$Sig <- ifelse(abs(res$log2FoldChange) > 1 & res$padj < 0.05, "Yes", "No")
+# Plot volcano plot
+ggplot(res, aes(x = logFC, y = logPadj, color = Sig)) +
+geom_point(alpha = 0.6) +
+geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") +
+geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "blue") +
+xlim(c(-max(abs(res$log2FoldChange)), max(abs(res$log2FoldChange)))) +
+labs(x = "log2 Fold Change", y = "-log10(Adjusted p-value)", title = "Volcano Plot") +
+theme_minimal() +
+theme(legend.position = "none")
+library(GEOquery)
+library(tidyverse)
+library(magrittr)
+mRNA_GSE20347 <- getGEO("GSE20347", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T)
+if (length(mRNA_GSE20347) > 1) idx <- grep("GPL4508", attr(mRNA_GSE20347, "names")) else idx <- 1
+mRNA_GSE20347 <- mRNA_GSE20347[[idx]]
+GSE20347<- mRNA_GSE20347@assayData[["exprs"]]
+colx<- mRNA_GSE20347@featureData@data[["Gene symbol"]]
+GSE20347<- cbind(colx,GSE20347)
+# Convert matrix to a dataframe
+df_GSE20347 <- as.data.frame(GSE20347)
+library(DESeq2)
+# Extract only the count matrix
+count_matrix <- df_GSE20347[, 2:ncol(df_GSE20347)]
+# Sample information
+sample_names <- colnames(count_matrix)
+# Create the sample_info data frame
+conditions <- c(rep("normal adjacent esophageal tissue", 17), rep("esophageal squamous cell carcinoma", length(sample_names) - 17))
+sample_info <- data.frame(Sample = sample_names, Condition = conditions)
+#### SKIRMISH FOR DESEQ2 DATASET ###########
+count_matrix <- as.matrix(sapply(count_matrix, as.numeric))
+count_matrix <- count_matrix[complete.cases(count_matrix), ]
+count_matrix <- round(count_matrix)
+#get rownames of previous data
+row_names_GSE20347 <- rownames(GSE20347)
+# Set gene_id to rownames again
+rownames(count_matrix) <- row_names_GSE20347
+# Create the DESeqDataSet using the updated sample_info
+dds <- DESeqDataSetFromMatrix(countData = count_matrix,
+colData = sample_info,
+design = ~ Condition)
+# Perform differential gene expression analysis
+dds <- DESeq(dds)
+resultsNames(dds)
+par(mar=c(8,5,2,2))
+boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)
+res <- results(dds, name="Condition_normal.adjacent.esophageal.tissue_vs_esophageal.squamous.cell.carcinoma")
+summary(res)
+# Adjustment to overcome NA-error --> exclude rows containing NA's
+filtered_indices <- which(res$log2FoldChange < 1 & res$log2FoldChange > -1 & res$padj < 0.05 & !is.na(res$padj))
+filtered_downregulated_genes <- res[filtered_indices, ]
+gene_names <- rownames(filtered_downregulated_genes)
+gene_names
+res2 <- as.data.frame(results(dds, name = "Condition_normal.adjacent.esophageal.tissue_vs_esophageal.squamous.cell.carcinoma"))
+# Convert p-value to -log10(p-value)
+res2$log10_pvalue <- -log10(res2$pvalue)
+# Create the volcano plot
+volcano_plot <- ggplot(data = res2, aes(x = log2FoldChange, y = log10_pvalue)) +
+geom_point() +
+labs(x = "log2 Fold Change", y = "-log10 p-value") +
+theme_minimal()
+# Display the volcano plot
+print(volcano_plot)
+# Add vertical lines for fold change thresholds
+volcano_plot <- volcano_plot +
+geom_vline(xintercept = c(-1, 1), color = "red") +
+geom_hline(yintercept = -log10(0.05), color = "red")
+# Display the customized volcano plot
+print(volcano_plot)
+# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0
+################################################################
+#   Differential expression analysis with limma
+library(GEOquery)
+library(limma)
+library(umap)
+gset <- getGEO("GSE20347", GSEMatrix =TRUE, AnnotGPL=TRUE)
+if (length(gset) > 1) idx <- grep("GPL571", attr(gset, "names")) else idx <- 1
+gset <- gset[[idx]]
+# make proper column names to match toptable
+fvarLabels(gset) <- make.names(fvarLabels(gset))
+# group membership for all samples
+gsms <- "0000000000000000011111111111111111"
+sml <- strsplit(gsms, split="")[[1]]
+# log2 transformation
+ex <- exprs(gset)
+qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
+LogC <- (qx[5] > 100) ||
+(qx[6]-qx[1] > 50 && qx[2] > 0)
+if (LogC) { ex[which(ex <= 0)] <- NaN
+exprs(gset) <- log2(ex) }
+# assign samples to groups and set up design matrix
+gs <- factor(sml)
+groups <- make.names(c("normal","cancer"))
+levels(gs) <- groups
+gset$group <- gs
+design <- model.matrix(~group + 0, gset)
+colnames(design) <- levels(gs)
+gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
+fit <- lmFit(gset, design)  # fit linear model
+# set up contrasts of interest and recalculate model coefficients
+cts <- c(paste(groups[1],"-",groups[2],sep=""))
+cont.matrix <- makeContrasts(contrasts=cts, levels=design)
+fit2 <- contrasts.fit(fit, cont.matrix)
+# compute statistics and table of top significant genes
+fit2 <- eBayes(fit2, 0.01)
+tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)
+tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","GB_ACC","SPOT_ID","Gene.Symbol","Gene.symbol","Gene.title"))
+write.table(tT, file=stdout(), row.names=F, sep="\t")
+# Visualize and quality control test results.
+# Build histogram of P-values for all genes. Normal test
+# assumption is that most genes are not differentially expressed.
+tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
+hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
+ylab = "Number of genes", main = "P-adj value distribution")
+# summarize test results as "up", "down" or "not expressed"
+dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1)
+# Venn diagram of results
+vennDiagram(dT, circle.col=palette())
+# create Q-Q plot for t-statistic
+t.good <- which(!is.na(fit2$F)) # filter out bad probes
+qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")
+# volcano plot (log P-value vs log fold change)
+colnames(fit2) # list contrast names
+ct <- 1        # choose contrast of interest
+volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20,
+highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2)))
+View(tT2)
+View(tT)
+volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20,
+highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2)))
+# MD plot (log fold change vs mean log expression)
+# highlight statistically significant (p-adj < 0.05) probes
+plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1)
+abline(h=0)
+################################################################
+# General expression data analysis
+ex <- exprs(gset)
+# box-and-whisker plot
+dev.new(width=3+ncol(gset)/6, height=5)
+ord <- order(gs)  # order samples by group
+palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02",
+"#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666"))
+library(GEOquery)
+library(limma)
+library(umap)
+gset <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=FALSE)
+if (length(gset) > 1) idx <- grep("GPL16543", attr(gset, "names")) else idx <- 1
+gset <- gset[[idx]]
+# make proper column names to match toptable
+fvarLabels(gset) <- make.names(fvarLabels(gset))
+# group membership for all samples
+gsms <- paste0("11110000111100001111000011110000111100001111000011",
+"11000011110000111100001111000011110000111100001111",
+"00001111000011110000111100000111100011110000111100",
+"00111100001011110000111100001111000011110000111100",
+"00111000111100001111000011100011110000")
+sml <- strsplit(gsms, split="")[[1]]
+# log2 transformation
+ex <- exprs(gset)
+qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
+LogC <- (qx[5] > 100) ||
+(qx[6]-qx[1] > 50 && qx[2] > 0)
+if (LogC) { ex[which(ex <= 0)] <- NaN
+exprs(gset) <- log2(ex) }
+# assign samples to groups and set up design matrix
+gs <- factor(sml)
+groups <- make.names(c("Normal","Cancer"))
+levels(gs) <- groups
+gset$group <- gs
+design <- model.matrix(~group + 0, gset)
+colnames(design) <- levels(gs)
+gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
+fit <- lmFit(gset, design)  # fit linear model
+# set up contrasts of interest and recalculate model coefficients
+cts <- paste(groups[1], groups[2], sep="-")
+cont.matrix <- makeContrasts(contrasts=cts, levels=design)
+fit2 <- contrasts.fit(fit, cont.matrix)
+# compute statistics and table of top significant genes
+fit2 <- eBayes(fit2, 0.01)
+tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)
+tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","SPOT_ID","miRNA_ID"))
+write.table(tT, file=stdout(), row.names=F, sep="\t")
+# Visualize and quality control test results.
+# Build histogram of P-values for all genes. Normal test
+# assumption is that most genes are not differentially expressed.
+tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
+hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
+ylab = "Number of genes", main = "P-adj value distribution")
+# summarize test results as "up", "down" or "not expressed"
+dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1)
+# Venn diagram of results
+vennDiagram(dT, circle.col=palette())
+# create Q-Q plot for t-statistic
+t.good <- which(!is.na(fit2$F)) # filter out bad probes
+qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")
+# volcano plot (log P-value vs log fold change)
+colnames(fit2) # list contrast names
+ct <- 1        # choose contrast of interest
+volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20,
+highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2)))
+# MD plot (log fold change vs mean log expression)
+# highlight statistically significant (p-adj < 0.05) probes
+plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1)
+abline(h=0)
+################################################################
+# General expression data analysis
+ex <- exprs(gset)
+# box-and-whisker plot
+dev.new(width=3+ncol(gset)/6, height=5)
+ord <- order(gs)  # order samples by group
+palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02",
+"#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666"))
+par(mar=c(7,4,2,1))
+title <- paste ("GSE43732", "/", annotation(gset), sep ="")
+boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord])
+legend("topleft", groups, fill=palette(), bty="n")
+dev.off()
+# expression value distribution
+par(mar=c(4,4,2,1))
+title <- paste ("GSE43732", "/", annotation(gset), " value distribution", sep ="")
+plotDensities(ex, group=gs, main=title, legend ="topright")
+# UMAP plot (dimensionality reduction)
+ex <- na.omit(ex) # eliminate rows with NAs
+ex <- ex[!duplicated(ex), ]  # remove duplicates
+ump <- umap(t(ex), n_neighbors = 15, random_state = 123)
+par(mar=c(3,3,2,6), xpd=TRUE)
+plot(ump$layout, main="UMAP plot, nbrs=15", xlab="", ylab="", col=gs, pch=20, cex=1.5)
+legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20,
+col=1:nlevels(gs), title="Group", pt.cex=1.5)
+library("maptools")  # point labels without overlaps
+pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6)
+# mean-variance trend, helps to see if precision weights are needed
+plotSA(fit2, main="Mean variance trend, GSE43732")
+# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0
+################################################################
+#   Differential expression analysis with limma
+library(GEOquery)
+library(limma)
+library(umap)
+gset <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=FALSE)
+if (length(gset) > 1) idx <- grep("GPL16543", attr(gset, "names")) else idx <- 1
+gset <- gset[[idx]]
+# make proper column names to match toptable
+fvarLabels(gset) <- make.names(fvarLabels(gset))
+# group membership for all samples
+gsms <- paste0("00001111000011110000111100001111000011110000111100",
+"00111100001111000011110000111100001111000011110000",
+"11110000111100001111000011111000011100001111000011",
+"11000011110100001111000011110000111100001111000011",
+"11000111000011110000111100011100001111")
+sml <- strsplit(gsms, split="")[[1]]
+# log2 transformation
+ex <- exprs(gset)
+qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
+LogC <- (qx[5] > 100) ||
+(qx[6]-qx[1] > 50 && qx[2] > 0)
+if (LogC) { ex[which(ex <= 0)] <- NaN
+exprs(gset) <- log2(ex) }
+# assign samples to groups and set up design matrix
+gs <- factor(sml)
+groups <- make.names(c("cancer","normal"))
+levels(gs) <- groups
+gset$group <- gs
+design <- model.matrix(~group + 0, gset)
+colnames(design) <- levels(gs)
+gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
+fit <- lmFit(gset, design)  # fit linear model
+# set up contrasts of interest and recalculate model coefficients
+cts <- paste(groups[1], groups[2], sep="-")
+cont.matrix <- makeContrasts(contrasts=cts, levels=design)
+fit2 <- contrasts.fit(fit, cont.matrix)
+# compute statistics and table of top significant genes
+fit2 <- eBayes(fit2, 0.01)
+tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)
+tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","SPOT_ID","miRNA_ID"))
+write.table(tT, file=stdout(), row.names=F, sep="\t")
+# Visualize and quality control test results.
+# Build histogram of P-values for all genes. Normal test
+# assumption is that most genes are not differentially expressed.
+tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
+hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
+ylab = "Number of genes", main = "P-adj value distribution")
+# summarize test results as "up", "down" or "not expressed"
+dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=0)
+# Venn diagram of results
+vennDiagram(dT, circle.col=palette())
+# create Q-Q plot for t-statistic
+t.good <- which(!is.na(fit2$F)) # filter out bad probes
+qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")
+# volcano plot (log P-value vs log fold change)
+colnames(fit2) # list contrast names
+ct <- 1        # choose contrast of interest
+volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20,
+highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2)))
+# MD plot (log fold change vs mean log expression)
+# highlight statistically significant (p-adj < 0.05) probes
+plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1)
+abline(h=0)
+################################################################
+# General expression data analysis
+ex <- exprs(gset)
+# box-and-whisker plot
+dev.new(width=3+ncol(gset)/6, height=5)
+ord <- order(gs)  # order samples by group
+palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02",
+"#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666"))
+par(mar=c(7,4,2,1))
+title <- paste ("GSE43732", "/", annotation(gset), sep ="")
+boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord])
+legend("topleft", groups, fill=palette(), bty="n")
+dev.off()
+# expression value distribution
+par(mar=c(4,4,2,1))
+title <- paste ("GSE43732", "/", annotation(gset), " value distribution", sep ="")
+plotDensities(ex, group=gs, main=title, legend ="topright")
+# UMAP plot (dimensionality reduction)
+ex <- na.omit(ex) # eliminate rows with NAs
+ex <- ex[!duplicated(ex), ]  # remove duplicates
+ump <- umap(t(ex), n_neighbors = 15, random_state = 123)
+par(mar=c(3,3,2,6), xpd=TRUE)
+plot(ump$layout, main="UMAP plot, nbrs=15", xlab="", ylab="", col=gs, pch=20, cex=1.5)
+legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20,
+col=1:nlevels(gs), title="Group", pt.cex=1.5)
+library("maptools")  # point labels without overlaps
+pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6)
+# mean-variance trend, helps to see if precision weights are needed
+plotSA(fit2, main="Mean variance trend, GSE43732")
diff --git "a/DGE/Deseq2/mRNA/GSE20347\342\200\213 mRNA\342\200\213.R" "b/DGE/Deseq2/mRNA/GSE20347\342\200\213 mRNA\342\200\213.R"
new file mode 100644
index 0000000000000000000000000000000000000000..d6713d804dca6b8f63734f225deb28cc967fad48
--- /dev/null
+++ "b/DGE/Deseq2/mRNA/GSE20347\342\200\213 mRNA\342\200\213.R"	
@@ -0,0 +1,94 @@
+library(GEOquery)
+library(tidyverse)
+library(magrittr)
+
+mRNA_GSE20347 <- getGEO("GSE20347", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T)
+if (length(mRNA_GSE20347) > 1) idx <- grep("GPL4508", attr(mRNA_GSE20347, "names")) else idx <- 1
+mRNA_GSE20347 <- mRNA_GSE20347[[idx]]
+GSE20347<- mRNA_GSE20347@assayData[["exprs"]]
+colx<- mRNA_GSE20347@featureData@data[["Gene symbol"]]
+GSE20347<- cbind(colx,GSE20347)
+
+# Convert matrix to a dataframe
+df_GSE20347 <- as.data.frame(GSE20347)
+
+library(DESeq2)
+
+# Extract only the count matrix
+count_matrix <- df_GSE20347[, 2:ncol(df_GSE20347)]
+
+# Sample information
+sample_names <- colnames(count_matrix)
+
+# Create the sample_info data frame
+conditions <- c(rep("normal adjacent esophageal tissue", 17), rep("esophageal squamous cell carcinoma", length(sample_names) - 17))
+sample_info <- data.frame(Sample = sample_names, Condition = conditions)
+
+#### SKIRMISH FOR DESEQ2 DATASET ###########
+count_matrix <- as.matrix(sapply(count_matrix, as.numeric))
+count_matrix <- count_matrix[complete.cases(count_matrix), ]
+count_matrix <- round(count_matrix)
+
+#get rownames of previous data
+row_names_GSE20347 <- rownames(GSE20347)
+
+# Set gene_id to rownames again
+rownames(count_matrix) <- row_names_GSE20347
+
+# Create the DESeqDataSet using the updated sample_info
+dds <- DESeqDataSetFromMatrix(countData = count_matrix,
+                              colData = sample_info,
+                              design = ~ Condition)
+
+# Perform differential gene expression analysis
+dds <- DESeq(dds)
+
+resultsNames(dds)
+
+par(mar=c(8,5,2,2))
+boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)
+
+res <- results(dds, name="Condition_normal.adjacent.esophageal.tissue_vs_esophageal.squamous.cell.carcinoma")
+summary(res)
+
+# Adjustment to overcome NA-error --> exclude rows containing NA's
+filtered_indices <- which(res$log2FoldChange < 1 & res$log2FoldChange > -1 & res$padj < 0.05 & !is.na(res$padj))
+filtered_downregulated_genes <- res[filtered_indices, ]
+gene_names <- rownames(filtered_downregulated_genes)
+gene_names
+
+
+res2 <- as.data.frame(results(dds, name = "Condition_normal.adjacent.esophageal.tissue_vs_esophageal.squamous.cell.carcinoma"))
+
+# Convert p-value to -log10(p-value)
+res2$log10_pvalue <- -log10(res2$pvalue)
+
+# Create the volcano plot
+volcano_plot <- ggplot(data = res2, aes(x = log2FoldChange, y = log10_pvalue)) +
+  geom_point() +
+  labs(x = "log2 Fold Change", y = "-log10 p-value") +
+  theme_minimal()
+
+# Display the volcano plot
+print(volcano_plot)
+
+# Add vertical lines for fold change thresholds
+volcano_plot <- volcano_plot +
+  geom_vline(xintercept = c(-1, 1), color = "red") +
+  geom_hline(yintercept = -log10(0.05), color = "red")
+
+# Display the customized volcano plot
+print(volcano_plot)
+
+
+
+rld <- rlog(dds, blind=TRUE) 
+PCA1 <- plotPCA(rld, intgroup = "Condition")
+PCA1
+
+vst <- assay(vst(dds))
+p <- pca(vst, removeVar = 0.1)
+screeplot(p, axisLabSize = 18, titleLabSize = 22)
+
+biplot(p, showLoadings = TRUE,
+       labSize = 5, pointSize = 5, sizeLoadingsNames = 5)
diff --git "a/DGE/Deseq2/mRNA/GSE29001\342\200\213 mRNA\342\200\213.R" "b/DGE/Deseq2/mRNA/GSE29001\342\200\213 mRNA\342\200\213.R"
new file mode 100644
index 0000000000000000000000000000000000000000..c8be78f1ee4568c17d6cd3615dc6b254f18013b7
--- /dev/null
+++ "b/DGE/Deseq2/mRNA/GSE29001\342\200\213 mRNA\342\200\213.R"	
@@ -0,0 +1,71 @@
+library(GEOquery)
+library(tidyverse)
+library(magrittr)
+
+mRNA_GSE29001 <- getGEO("GSE29001", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T)
+if (length(mRNA_GSE29001) > 1) idx <- grep("GPL4508", attr(mRNA_GSE29001, "names")) else idx <- 1
+mRNA_GSE29001 <- mRNA_GSE29001[[idx]]
+GSE29001 <- mRNA_GSE29001@assayData[["exprs"]]
+colx <- mRNA_GSE29001@featureData@data[["Gene symbol"]]
+GSE29001 <- cbind(colx,GSE29001)
+
+# Convert matrix to a dataframe
+df_GSE29001 <- as.data.frame(GSE29001)
+
+library(DESeq2)
+
+# Extract only the count matrix
+count_matrix <- df_GSE29001[, 2:ncol(df_GSE29001)]
+
+# Sample information
+sample_names <- colnames(count_matrix)
+conditions = c("normal", "normal", "tumor", "normal", "normal", "tumor", "tumor", "normal", "normal", "tumor", "tumor",
+                            "normal", "normal", "tumor", "tumor", "normal", "normal", "tumor", "tumor", "normal", "normal",  "tumor", 
+                            "normal", "normal", "tumor", "tumor", "normal", "normal", "tumor", "tumor", "normal", "normal", "tumor", 
+                            "tumor", "normal", "normal", "tumor", "tumor", "normal", "normal", "tumor", "tumor", "normal", "normal", "tumor")
+
+sample_info <- data.frame(Sample = sample_names, condition = conditions)
+
+#### SKIRMISH FOR DESEQ2 DATASET ###########
+count_matrix <- as.matrix(sapply(count_matrix, as.numeric))
+count_matrix <- count_matrix[complete.cases(count_matrix), ]
+count_matrix <- round(count_matrix)
+
+#get rownames of previous data
+row_names_GSE29001 <- rownames(GSE29001)
+
+# Set gene_id to rownames again
+rownames(count_matrix) <- row_names_GSE29001
+
+# Create the DESeqDataSet using the updated sample_info
+dds <- DESeqDataSetFromMatrix(countData = count_matrix,
+                              colData = sample_info,
+                              design = ~ condition)
+
+# Perform differential gene expression analysis
+dds <- DESeq(dds)
+
+resultsNames(dds)
+
+par(mar=c(8,5,2,2))
+boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)
+
+res <- results(dds, name="condition_tumor_vs_normal")
+summary(res)
+
+# Adjustment to overcome NA-error --> exclude rows containing NA's
+filtered_indices <- which(res$log2FoldChange > 1 & res$padj < 0.05 & !is.na(res$padj))
+filtered_downregulated_genes <- res[filtered_indices, ]
+gene_names <- rownames(filtered_downregulated_genes)
+gene_names
+
+rld <- rlog(dds, blind=TRUE) 
+PCA1 <- plotPCA(rld, intgroup = "condition")
+PCA1
+
+vst <- assay(vst(dds))
+p <- pca(vst, removeVar = 0.1)
+screeplot(p, axisLabSize = 18, titleLabSize = 22)
+
+biplot(p, showLoadings = TRUE,
+       labSize = 5, pointSize = 5, sizeLoadingsNames = 5)
diff --git "a/DGE/Deseq2/mRNA/GSE70409\342\200\213 mRNA\342\200\213.R" "b/DGE/Deseq2/mRNA/GSE70409\342\200\213 mRNA\342\200\213.R"
new file mode 100644
index 0000000000000000000000000000000000000000..06db7568eccf467e15bf67cd422a0e717cc35ccd
--- /dev/null
+++ "b/DGE/Deseq2/mRNA/GSE70409\342\200\213 mRNA\342\200\213.R"	
@@ -0,0 +1,71 @@
+library(GEOquery)
+library(tidyverse)
+library(magrittr)
+library(PCAtools)
+
+mRNA_GSE70409 <- getGEO("GSE70409", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T)
+if (length(mRNA_GSE70409) > 1) idx <- grep("GPL4508", attr(mRNA_GSE70409, "names")) else idx <- 1
+mRNA_GSE70409 <- mRNA_GSE70409[[idx]]
+GSE70409 <- mRNA_GSE70409@assayData[["exprs"]]
+colx<- mRNA_GSE70409@featureData@data[["Gene_symbol"]]
+GSE70409 <- cbind(colx,GSE70409)
+
+# Convert matrix to a dataframe
+df_GSE70409 <- as.data.frame(GSE70409)
+
+library(DESeq2)
+
+# Extract only the count matrix
+count_matrix <- df_GSE70409[, 2:ncol(df_GSE70409)]
+
+# Sample information
+sample_names <- colnames(count_matrix)
+conditions = c("normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal",   
+                "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", 
+                "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor")
+
+sample_info <- data.frame(Sample = sample_names, condition = conditions)
+
+#### SKIRMISH FOR DESEQ2 DATASET ###########
+count_matrix <- as.matrix(sapply(count_matrix, as.numeric))
+count_matrix <- count_matrix[complete.cases(count_matrix), ]
+count_matrix <- round(count_matrix)
+
+#get rownames of previous data
+row_names_GSE70409 <- rownames(GSE70409)
+
+# Set gene_id to rownames again
+rownames(count_matrix) <- row_names_GSE70409
+
+# Create the DESeqDataSet using the updated sample_info
+dds <- DESeqDataSetFromMatrix(countData = count_matrix,
+                              colData = sample_info,
+                              design = ~ condition)
+
+# Perform differential gene expression analysis
+dds <- DESeq(dds)
+resultsNames(dds)
+
+par(mar=c(8,5,2,2))
+boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)
+
+res <- results(dds, name="condition_tumor_vs_normal")
+summary(res)
+
+# Adjustment to overcome NA-error --> exclude rows containing NA's
+filtered_indices <- which(res$log2FoldChange > 1 & res$padj < 0.05 & !is.na(res$padj))
+filtered_downregulated_genes <- res[filtered_indices, ]
+gene_names <- rownames(filtered_downregulated_genes)
+gene_names
+
+
+rld <- rlog(dds, blind=TRUE) 
+PCA1 <- plotPCA(rld, intgroup = "condition")
+PCA1
+
+vst <- assay(vst(dds))
+p <- pca(vst, removeVar = 0.1)
+screeplot(p, axisLabSize = 18, titleLabSize = 22)
+
+biplot(p, showLoadings = TRUE,
+       labSize = 5, pointSize = 5, sizeLoadingsNames = 5)
diff --git a/DGE/Deseq2/miRNA/GSE43732_miRNA.R b/DGE/Deseq2/miRNA/GSE43732_miRNA.R
new file mode 100644
index 0000000000000000000000000000000000000000..4c9831a5b6a506bf1b5fd985bcb904d6e5fb87a0
--- /dev/null
+++ b/DGE/Deseq2/miRNA/GSE43732_miRNA.R
@@ -0,0 +1,115 @@
+library(GEOquery)
+library(tidyverse)
+library(magrittr)
+library(multiMiR)
+library(writexl)
+
+miRNA_GSE43732 <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T)
+if (length(miRNA_GSE43732) > 1) idx <- grep("GPL4508", attr(miRNA_GSE43732, "names")) else idx <- 1
+miRNA_GSE43732 <- miRNA_GSE43732[[idx]]
+GSE43732 <- miRNA_GSE43732@assayData[["exprs"]]
+colx<- miRNA_GSE43732@featureData@data[["Name"]]
+GSE43732 <- cbind(colx,GSE43732)
+
+
+# Convert matrix to a dataframe
+df_GSE43732 <- as.data.frame(GSE43732)
+
+library(DESeq2)
+
+# Extract only the count matrix
+count_matrix <- df_GSE43732[, 1:ncol(df_GSE43732)]
+
+# Sample information
+sample_names <- colnames(count_matrix)
+conditions = c("cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",  
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",  
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "normal", 
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", 
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal")
+
+sample_info <- data.frame(Sample = sample_names, condition = conditions)
+
+#### SKIRMISH FOR DESEQ2 DATASET ###########
+#make NA's to 0
+count_matrix <- as.matrix(sapply(count_matrix, as.numeric))
+count_matrix[is.na(count_matrix)] <- 0
+count_matrix <- round(count_matrix)
+
+#get rownames of previous data
+row_names_GSE43732 <- rownames(GSE43732)
+
+# Set gene_id to rownames again
+rownames(count_matrix) <- row_names_GSE43732
+
+#negative values to abs() --> simulate counts
+count_matrix <- abs(count_matrix)
+
+
+# Create the DESeqDataSet using the updated sample_info
+dds <- DESeqDataSetFromMatrix(countData = count_matrix,
+                              colData = sample_info,
+                              design = ~ condition)
+
+# Perform differential gene expression analysis
+dds <- DESeq(dds)
+
+resultsNames(dds)
+
+par(mar=c(8,5,2,2))
+boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)
+
+res <- results(dds, name="condition_normal_vs_cancer")
+summary(res)
+
+# Adjustment to overcome NA-error --> exclude rows containing NA's
+filtered_indices <- which(res$log2FoldChange > 1 & res$padj < 0.05 & !is.na(res$padj))
+filtered_downregulated_genes <- res[filtered_indices, ]
+gene_names <- rownames(filtered_downregulated_genes)
+gene_names
+
+multimir_results <- get_multimir(org     = 'hsa',
+                                 mirna   = gene_names,
+                                 table   = 'validated',
+                                 summary = TRUE)
+
+targets <- multimir_results@data
+excel_file <- "C:/Users/Emre/Desktop/Bioinformatik/Bioinformatik 2.Semester Master/Data Science in the Life Sciences/Project/multimir_results.xlsx"
+write_xlsx(targets, path = excel_file )
+
+rld <- rlog(dds, blind=TRUE) 
+PCA1 <- plotPCA(rld, intgroup = "condition")
+PCA1
+
+vst <- assay(vst(dds))
+p <- pca(vst, removeVar = 0.1)
+screeplot(p, axisLabSize = 18, titleLabSize = 22)
+
+biplot(p, showLoadings = TRUE,
+       labSize = 5, pointSize = 5, sizeLoadingsNames = 5)
diff --git a/DGE/Deseq2/miRNA/GSE43732_miRNA_Multimir.R b/DGE/Deseq2/miRNA/GSE43732_miRNA_Multimir.R
new file mode 100644
index 0000000000000000000000000000000000000000..ae5627db9110831b6f8004baa203eb86fd2ca1c8
--- /dev/null
+++ b/DGE/Deseq2/miRNA/GSE43732_miRNA_Multimir.R
@@ -0,0 +1,137 @@
+# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0
+################################################################
+#   Differential expression analysis with limma
+library(GEOquery)
+library(limma)
+library(umap)
+
+# load series and platform data from GEO
+
+gset <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=FALSE)
+if (length(gset) > 1) idx <- grep("GPL16543", attr(gset, "names")) else idx <- 1
+gset <- gset[[idx]]
+
+# make proper column names to match toptable 
+fvarLabels(gset) <- make.names(fvarLabels(gset))
+
+# group membership for all samples
+gsms <- paste0("11110000111100001111000011110000111100001111000011",
+        "11000011110000111100001111000011110000111100001111",
+        "00001111000011110000111100000111100011110000111100",
+        "00111100001011110000111100001111000011110000111100",
+        "00111000111100001111000011100011110000")
+sml <- strsplit(gsms, split="")[[1]]
+
+# log2 transformation
+ex <- exprs(gset)
+qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
+LogC <- (qx[5] > 100) ||
+          (qx[6]-qx[1] > 50 && qx[2] > 0)
+if (LogC) { ex[which(ex <= 0)] <- NaN
+  exprs(gset) <- log2(ex) }
+
+# assign samples to groups and set up design matrix
+gs <- factor(sml)
+groups <- make.names(c("Normal","Cancer"))
+levels(gs) <- groups
+gset$group <- gs
+design <- model.matrix(~group + 0, gset)
+colnames(design) <- levels(gs)
+
+gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
+
+fit <- lmFit(gset, design)  # fit linear model
+
+# set up contrasts of interest and recalculate model coefficients
+cts <- paste(groups[1], groups[2], sep="-")
+cont.matrix <- makeContrasts(contrasts=cts, levels=design)
+fit2 <- contrasts.fit(fit, cont.matrix)
+
+# compute statistics and table of top significant genes
+fit2 <- eBayes(fit2, 0.01)
+tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)
+
+tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","SPOT_ID","miRNA_ID"))
+
+################ GO #########################
+
+
+
+######################### MULTIMIR ###################
+mirnames <- tT$ID
+
+library(multiMiR)
+multimir_results <- get_multimir(org     = 'hsa',
+                                 mirna   = mirnames,
+                                 table   = 'validated',
+                                 summary = TRUE)
+head(multimir_results@data)
+
+targets <- multimir_results@data
+excel_file <- "C:/Users/Emre/Desktop/Bioinformatik/Bioinformatik 2.Semester Master/Data Science in the Life Sciences/Project/multimir_results.xlsx"
+write_xlsx(targets, path = excel_file )
+
+write.table(tT, file=stdout(), row.names=F, sep="\t")
+
+# Visualize and quality control test results.
+# Build histogram of P-values for all genes. Normal test
+# assumption is that most genes are not differentially expressed.
+tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
+hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
+  ylab = "Number of genes", main = "P-adj value distribution")
+
+# summarize test results as "up", "down" or "not expressed"
+dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1)
+
+# Venn diagram of results
+vennDiagram(dT, circle.col=palette())
+
+# create Q-Q plot for t-statistic
+t.good <- which(!is.na(fit2$F)) # filter out bad probes
+qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")
+
+# volcano plot (log P-value vs log fold change)
+colnames(fit2) # list contrast names
+ct <- 1        # choose contrast of interest
+volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20,
+  highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2)))
+
+# MD plot (log fold change vs mean log expression)
+# highlight statistically significant (p-adj < 0.05) probes
+plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1)
+abline(h=0)
+
+################################################################
+# General expression data analysis
+ex <- exprs(gset)
+
+# box-and-whisker plot
+dev.new(width=3+ncol(gset)/6, height=5)
+ord <- order(gs)  # order samples by group
+palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02",
+          "#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666"))
+par(mar=c(7,4,2,1))
+title <- paste ("GSE43732", "/", annotation(gset), sep ="")
+boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord])
+legend("topleft", groups, fill=palette(), bty="n")
+dev.off()
+
+# expression value distribution
+par(mar=c(4,4,2,1))
+title <- paste ("GSE43732", "/", annotation(gset), " value distribution", sep ="")
+plotDensities(ex, group=gs, main=title, legend ="topright")
+
+# UMAP plot (dimensionality reduction)
+ex <- na.omit(ex) # eliminate rows with NAs
+ex <- ex[!duplicated(ex), ]  # remove duplicates
+ump <- umap(t(ex), n_neighbors = 15, random_state = 123)
+par(mar=c(3,3,2,6), xpd=TRUE)
+plot(ump$layout, main="UMAP plot, nbrs=15", xlab="", ylab="", col=gs, pch=20, cex=1.5)
+legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20,
+col=1:nlevels(gs), title="Group", pt.cex=1.5)
+library("maptools")  # point labels without overlaps
+pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6)
+
+# mean-variance trend, helps to see if precision weights are needed
+plotSA(fit2, main="Mean variance trend, GSE43732")
+
diff --git a/DGE/Deseq2/miRNA/GSE55857 miRNA errors.R b/DGE/Deseq2/miRNA/GSE55857 miRNA errors.R
new file mode 100644
index 0000000000000000000000000000000000000000..08e8e85c7f09a00eb473cfa87c796e41889a2b79
--- /dev/null
+++ b/DGE/Deseq2/miRNA/GSE55857 miRNA errors.R	
@@ -0,0 +1,78 @@
+library(GEOquery)
+library(tidyverse)
+library(magrittr)
+library(DESeq2)
+
+miRNA_GSE55856 <- getGEO("GSE55856", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T)
+GSE55856 <- miRNA_GSE55856[["GSE55856_series_matrix.txt.gz"]]@assayData[["exprs"]]
+GSE55856 <- cbind(colx,GSE55856)
+
+
+# Convert matrix to a dataframe
+df_GSE55856 <- as.data.frame(GSE55856)
+
+# Extract only the count matrix
+count_matrix <- df_GSE55856[, 1:ncol(df_GSE55856)]
+
+# Sample information
+sample_names <- colnames(count_matrix)
+conditions = c("normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal",
+               "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal",
+               "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal",
+               "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal",
+               "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal",
+               "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal",
+               "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal",
+               "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal",
+               "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", 
+               "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal",
+               "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal",
+               "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal",
+               "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal",
+               "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal",
+               "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal",
+               "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal",
+               "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal",
+               "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal",
+               "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal",
+               "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal")
+
+sample_info <- data.frame(Sample = sample_names, condition = conditions)
+
+#### SKIRMISH FOR DESEQ2 DATASET ###########
+#make NA's to 0
+count_matrix <- as.matrix(sapply(count_matrix, as.numeric))
+count_matrix[is.na(count_matrix)] <- 0
+count_matrix <- round(count_matrix)
+
+#get rownames of previous data
+row_names_GSE55856 <- rownames(GSE55856)
+
+# Set gene_id to rownames again
+rownames(count_matrix) <- row_names_GSE55856
+
+#negative values to abs() --> simulate counts
+#count_matrix <- abs(count_matrix)
+
+
+# Create the DESeqDataSet using the updated sample_info
+dds <- DESeqDataSetFromMatrix(countData = count_matrix,
+                              colData = sample_info,
+                              design = ~ condition)
+
+# Perform differential gene expression analysis
+dds <- DESeq(dds)
+
+resultsNames(dds)
+res <- results(dds, name="condition_tumor_vs_normal")
+summary(res)
+
+# Adjustment to overcome NA-error --> exclude rows containing NA's
+filtered_indices <- which(res$log2FoldChange > 1 & res$padj < 0.05 & !is.na(res$padj))
+filtered_downregulated_genes <- res[filtered_indices, ]
+gene_names <- rownames(filtered_downregulated_genes)
+gene_names
+
+rld <- rlog(dds, blind=TRUE) 
+PCA1 <- plotPCA(rld, intgroup = "condition")
+PCA1
diff --git a/DGE/Deseq2/miRNA/GSE6188 miRNA.R b/DGE/Deseq2/miRNA/GSE6188 miRNA.R
new file mode 100644
index 0000000000000000000000000000000000000000..634b4f55d1f205a568e82fd6460efe07f84be81e
--- /dev/null
+++ b/DGE/Deseq2/miRNA/GSE6188 miRNA.R	
@@ -0,0 +1,110 @@
+library(GEOquery)
+library(tidyverse)
+library(dplyr)
+library(PCAtools)
+library(DESeq2)
+
+miRNA_GSE6188 <- getGEO("GSE6188", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T)
+if (length(miRNA_GSE6188) > 1) idx <- grep("GPL4508", attr(miRNA_GSE6188, "names")) else idx <- 1
+miRNA_GSE6188 <- miRNA_GSE6188[[idx]]
+GSE6188 <- miRNA_GSE6188@assayData[["exprs"]]
+colx<- miRNA_GSE6188@featureData@data[["Name"]]
+GSE6188 <- cbind(colx,GSE6188)
+
+# Convert matrix to a dataframe
+df_GSE6188 <- as.data.frame(GSE6188)
+df_GSE6188[is.na(df_GSE6188)] <- 0
+
+
+# Group the data frame by 'colx' and calculate the mean for other columns
+df_mean <- df_GSE6188 %>%
+  group_by(colx) %>%
+  summarise(everything(), mean, na.rm = TRUE)
+
+
+# Extract only the count matrix
+count_matrix <- df_GSE43732[, 1:ncol(df_GSE43732)]
+
+# Sample information
+sample_names <- colnames(count_matrix)
+conditions = c("cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",  
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",  
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "normal", 
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", 
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "normal", "normal", "normal",
+               "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal")
+
+sample_info <- data.frame(Sample = sample_names, condition = conditions)
+
+#### SKIRMISH FOR DESEQ2 DATASET ###########
+#make NA's to 0
+count_matrix <- as.matrix(sapply(count_matrix, as.numeric))
+count_matrix[is.na(count_matrix)] <- 0
+count_matrix <- round(count_matrix)
+
+#get rownames of previous data
+row_names_GSE43732 <- rownames(GSE43732)
+
+# Set gene_id to rownames again
+rownames(count_matrix) <- row_names_GSE43732
+
+#negative values to abs() --> simulate counts
+count_matrix <- abs(count_matrix)
+
+
+# Create the DESeqDataSet using the updated sample_info
+dds <- DESeqDataSetFromMatrix(countData = count_matrix,
+                              colData = sample_info,
+                              design = ~ condition)
+
+# Perform differential gene expression analysis
+dds <- DESeq(dds)
+resultsNames(dds)
+
+par(mar=c(8,5,2,2))
+boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)
+
+res <- results(dds, name="condition_normal_vs_cancer")
+summary(res)
+
+# Adjustment to overcome NA-error --> exclude rows containing NA's
+filtered_indices <- which(res$log2FoldChange > 1 & res$padj < 0.05 & !is.na(res$padj))
+filtered_downregulated_genes <- res[filtered_indices, ]
+gene_names <- rownames(filtered_downregulated_genes)
+gene_names
+
+rld <- rlog(dds, blind=TRUE) 
+PCA1 <- plotPCA(rld, intgroup = "condition")
+PCA1
+
+vst <- assay(vst(dds))
+p <- pca(vst, removeVar = 0.1)
+screeplot(p, axisLabSize = 18, titleLabSize = 22)
+
+biplot(p, showLoadings = TRUE,
+       labSize = 5, pointSize = 5, sizeLoadingsNames = 5)
\ No newline at end of file
diff --git a/DGE/Limma/mRNA/GSE20347.R b/DGE/Limma/mRNA/GSE20347.R
new file mode 100644
index 0000000000000000000000000000000000000000..209ebaadf86515b5a9212fa8c4193e97ea280d58
--- /dev/null
+++ b/DGE/Limma/mRNA/GSE20347.R
@@ -0,0 +1,113 @@
+# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0
+################################################################
+#   Differential expression analysis with limma
+library(GEOquery)
+library(limma)
+library(umap)
+
+# load series and platform data from GEO
+
+gset <- getGEO("GSE20347", GSEMatrix =TRUE, AnnotGPL=TRUE)
+if (length(gset) > 1) idx <- grep("GPL571", attr(gset, "names")) else idx <- 1
+gset <- gset[[idx]]
+
+# make proper column names to match toptable 
+fvarLabels(gset) <- make.names(fvarLabels(gset))
+
+# group membership for all samples
+gsms <- "0000000000000000011111111111111111"
+sml <- strsplit(gsms, split="")[[1]]
+
+# log2 transformation
+ex <- exprs(gset)
+qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
+LogC <- (qx[5] > 100) ||
+  (qx[6]-qx[1] > 50 && qx[2] > 0)
+if (LogC) { ex[which(ex <= 0)] <- NaN
+exprs(gset) <- log2(ex) }
+
+# assign samples to groups and set up design matrix
+gs <- factor(sml)
+groups <- make.names(c("normal","cancer"))
+levels(gs) <- groups
+gset$group <- gs
+design <- model.matrix(~group + 0, gset)
+colnames(design) <- levels(gs)
+
+gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
+
+fit <- lmFit(gset, design)  # fit linear model
+
+# set up contrasts of interest and recalculate model coefficients
+cts <- c(paste(groups[1],"-",groups[2],sep=""))
+cont.matrix <- makeContrasts(contrasts=cts, levels=design)
+fit2 <- contrasts.fit(fit, cont.matrix)
+
+# compute statistics and table of top significant genes
+fit2 <- eBayes(fit2, 0.01)
+tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)
+
+tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","GB_ACC","SPOT_ID","Gene.Symbol","Gene.symbol","Gene.title"))
+write.table(tT, file=stdout(), row.names=F, sep="\t")
+
+# Visualize and quality control test results.
+# Build histogram of P-values for all genes. Normal test
+# assumption is that most genes are not differentially expressed.
+tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
+hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
+     ylab = "Number of genes", main = "P-adj value distribution")
+
+# summarize test results as "up", "down" or "not expressed"
+dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1)
+
+# Venn diagram of results
+vennDiagram(dT, circle.col=palette())
+
+# create Q-Q plot for t-statistic
+t.good <- which(!is.na(fit2$F)) # filter out bad probes
+qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")
+
+# volcano plot (log P-value vs log fold change)
+colnames(fit2) # list contrast names
+ct <- 1        # choose contrast of interest
+volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20,
+            highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2)))
+
+# MD plot (log fold change vs mean log expression)
+# highlight statistically significant (p-adj < 0.05) probes
+plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1)
+abline(h=0)
+
+################################################################
+# General expression data analysis
+ex <- exprs(gset)
+
+# box-and-whisker plot
+dev.new(width=3+ncol(gset)/6, height=5)
+ord <- order(gs)  # order samples by group
+palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02",
+          "#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666"))
+par(mar=c(7,4,2,1))
+title <- paste ("GSE20347", "/", annotation(gset), sep ="")
+boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord])
+legend("topleft", groups, fill=palette(), bty="n")
+dev.off()
+
+# expression value distribution
+par(mar=c(4,4,2,1))
+title <- paste ("GSE20347", "/", annotation(gset), " value distribution", sep ="")
+plotDensities(ex, group=gs, main=title, legend ="topright")
+
+# UMAP plot (dimensionality reduction)
+ex <- na.omit(ex) # eliminate rows with NAs
+ex <- ex[!duplicated(ex), ]  # remove duplicates
+ump <- umap(t(ex), n_neighbors = 14, random_state = 123)
+par(mar=c(3,3,2,6), xpd=TRUE)
+plot(ump$layout, main="UMAP plot, nbrs=14", xlab="", ylab="", col=gs, pch=20, cex=1.5)
+legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20,
+       col=1:nlevels(gs), title="Group", pt.cex=1.5)
+library("maptools")  # point labels without overlaps
+pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6)
+
+# mean-variance trend, helps to see if precision weights are needed
+plotSA(fit2, main="Mean variance trend, GSE20347")
diff --git a/DGE/Limma/mRNA/GSE29001.R b/DGE/Limma/mRNA/GSE29001.R
new file mode 100644
index 0000000000000000000000000000000000000000..c08bfe481bc452bf6ff04a285a960bb1c59d7a3d
--- /dev/null
+++ b/DGE/Limma/mRNA/GSE29001.R
@@ -0,0 +1,113 @@
+# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0
+################################################################
+#   Differential expression analysis with limma
+library(GEOquery)
+library(limma)
+library(umap)
+
+# load series and platform data from GEO
+
+gset <- getGEO("GSE29001", GSEMatrix =TRUE, AnnotGPL=TRUE)
+if (length(gset) > 1) idx <- grep("GPL571", attr(gset, "names")) else idx <- 1
+gset <- gset[[idx]]
+
+# make proper column names to match toptable 
+fvarLabels(gset) <- make.names(fvarLabels(gset))
+
+# group membership for all samples
+gsms <- "001001100110011001100100110011001100110011001"
+sml <- strsplit(gsms, split="")[[1]]
+
+# log2 transformation
+ex <- exprs(gset)
+qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
+LogC <- (qx[5] > 100) ||
+  (qx[6]-qx[1] > 50 && qx[2] > 0)
+if (LogC) { ex[which(ex <= 0)] <- NaN
+exprs(gset) <- log2(ex) }
+
+# assign samples to groups and set up design matrix
+gs <- factor(sml)
+groups <- make.names(c("normal","cancer"))
+levels(gs) <- groups
+gset$group <- gs
+design <- model.matrix(~group + 0, gset)
+colnames(design) <- levels(gs)
+
+gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
+
+fit <- lmFit(gset, design)  # fit linear model
+
+# set up contrasts of interest and recalculate model coefficients
+cts <- c(paste(groups[1],"-",groups[2],sep=""))
+cont.matrix <- makeContrasts(contrasts=cts, levels=design)
+fit2 <- contrasts.fit(fit, cont.matrix)
+
+# compute statistics and table of top significant genes
+fit2 <- eBayes(fit2, 0.01)
+tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)
+
+tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","GB_ACC","SPOT_ID","Gene.Symbol","Gene.symbol","Gene.title"))
+write.table(tT, file=stdout(), row.names=F, sep="\t")
+
+# Visualize and quality control test results.
+# Build histogram of P-values for all genes. Normal test
+# assumption is that most genes are not differentially expressed.
+tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
+hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
+     ylab = "Number of genes", main = "P-adj value distribution")
+
+# summarize test results as "up", "down" or "not expressed"
+dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1)
+
+# Venn diagram of results
+vennDiagram(dT, circle.col=palette())
+
+# create Q-Q plot for t-statistic
+t.good <- which(!is.na(fit2$F)) # filter out bad probes
+qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")
+
+# volcano plot (log P-value vs log fold change)
+colnames(fit2) # list contrast names
+ct <- 1        # choose contrast of interest
+volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20,
+            highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2)))
+
+# MD plot (log fold change vs mean log expression)
+# highlight statistically significant (p-adj < 0.05) probes
+plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1)
+abline(h=0)
+
+################################################################
+# General expression data analysis
+ex <- exprs(gset)
+
+# box-and-whisker plot
+dev.new(width=3+ncol(gset)/6, height=5)
+ord <- order(gs)  # order samples by group
+palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02",
+          "#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666"))
+par(mar=c(7,4,2,1))
+title <- paste ("GSE29001", "/", annotation(gset), sep ="")
+boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord])
+legend("topleft", groups, fill=palette(), bty="n")
+dev.off()
+
+# expression value distribution
+par(mar=c(4,4,2,1))
+title <- paste ("GSE29001", "/", annotation(gset), " value distribution", sep ="")
+plotDensities(ex, group=gs, main=title, legend ="topright")
+
+# UMAP plot (dimensionality reduction)
+ex <- na.omit(ex) # eliminate rows with NAs
+ex <- ex[!duplicated(ex), ]  # remove duplicates
+ump <- umap(t(ex), n_neighbors = 15, random_state = 123)
+par(mar=c(3,3,2,6), xpd=TRUE)
+plot(ump$layout, main="UMAP plot, nbrs=15", xlab="", ylab="", col=gs, pch=20, cex=1.5)
+legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20,
+       col=1:nlevels(gs), title="Group", pt.cex=1.5)
+library("maptools")  # point labels without overlaps
+pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6)
+
+# mean-variance trend, helps to see if precision weights are needed
+plotSA(fit2, main="Mean variance trend, GSE29001")
diff --git a/DGE/Limma/mRNA/GSE70409.R b/DGE/Limma/mRNA/GSE70409.R
new file mode 100644
index 0000000000000000000000000000000000000000..d0571bc2322d9005c0dd72ead9a98136e55a78ad
--- /dev/null
+++ b/DGE/Limma/mRNA/GSE70409.R
@@ -0,0 +1,113 @@
+# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0
+################################################################
+#   Differential expression analysis with limma
+library(GEOquery)
+library(limma)
+library(umap)
+
+# load series and platform data from GEO
+
+gset <- getGEO("GSE70409", GSEMatrix =TRUE, AnnotGPL=FALSE)
+if (length(gset) > 1) idx <- grep("GPL13287", attr(gset, "names")) else idx <- 1
+gset <- gset[[idx]]
+
+# make proper column names to match toptable 
+fvarLabels(gset) <- make.names(fvarLabels(gset))
+
+# group membership for all samples
+gsms <- "0101010101010101010101010101011111"
+sml <- strsplit(gsms, split="")[[1]]
+
+# log2 transformation
+ex <- exprs(gset)
+qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
+LogC <- (qx[5] > 100) ||
+  (qx[6]-qx[1] > 50 && qx[2] > 0)
+if (LogC) { ex[which(ex <= 0)] <- NaN
+exprs(gset) <- log2(ex) }
+
+# assign samples to groups and set up design matrix
+gs <- factor(sml)
+groups <- make.names(c("normal","cancer"))
+levels(gs) <- groups
+gset$group <- gs
+design <- model.matrix(~group + 0, gset)
+colnames(design) <- levels(gs)
+
+gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
+
+fit <- lmFit(gset, design)  # fit linear model
+
+# set up contrasts of interest and recalculate model coefficients
+cts <- c(paste(groups[1],"-",groups[2],sep=""))
+cont.matrix <- makeContrasts(contrasts=cts, levels=design)
+fit2 <- contrasts.fit(fit, cont.matrix)
+
+# compute statistics and table of top significant genes
+fit2 <- eBayes(fit2, 0.01)
+tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)
+
+tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","GB_ACC","SEQUENCE","SPOT_ID"))
+write.table(tT, file=stdout(), row.names=F, sep="\t")
+
+# Visualize and quality control test results.
+# Build histogram of P-values for all genes. Normal test
+# assumption is that most genes are not differentially expressed.
+tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
+hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
+     ylab = "Number of genes", main = "P-adj value distribution")
+
+# summarize test results as "up", "down" or "not expressed"
+dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1)
+
+# Venn diagram of results
+vennDiagram(dT, circle.col=palette())
+
+# create Q-Q plot for t-statistic
+t.good <- which(!is.na(fit2$F)) # filter out bad probes
+qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")
+
+# volcano plot (log P-value vs log fold change)
+colnames(fit2) # list contrast names
+ct <- 1        # choose contrast of interest
+volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20,
+            highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2)))
+
+# MD plot (log fold change vs mean log expression)
+# highlight statistically significant (p-adj < 0.05) probes
+plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1)
+abline(h=0)
+
+################################################################
+# General expression data analysis
+ex <- exprs(gset)
+
+# box-and-whisker plot
+dev.new(width=3+ncol(gset)/6, height=5)
+ord <- order(gs)  # order samples by group
+palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02",
+          "#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666"))
+par(mar=c(7,4,2,1))
+title <- paste ("GSE70409", "/", annotation(gset), sep ="")
+boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord])
+legend("topleft", groups, fill=palette(), bty="n")
+dev.off()
+
+# expression value distribution
+par(mar=c(4,4,2,1))
+title <- paste ("GSE70409", "/", annotation(gset), " value distribution", sep ="")
+plotDensities(ex, group=gs, main=title, legend ="topright")
+
+# UMAP plot (dimensionality reduction)
+ex <- na.omit(ex) # eliminate rows with NAs
+ex <- ex[!duplicated(ex), ]  # remove duplicates
+ump <- umap(t(ex), n_neighbors = 14, random_state = 123)
+par(mar=c(3,3,2,6), xpd=TRUE)
+plot(ump$layout, main="UMAP plot, nbrs=14", xlab="", ylab="", col=gs, pch=20, cex=1.5)
+legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20,
+       col=1:nlevels(gs), title="Group", pt.cex=1.5)
+library("maptools")  # point labels without overlaps
+pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6)
+
+# mean-variance trend, helps to see if precision weights are needed
+plotSA(fit2, main="Mean variance trend, GSE70409")
\ No newline at end of file
diff --git a/DGE/Limma/miRNA/.Rhistory b/DGE/Limma/miRNA/.Rhistory
new file mode 100644
index 0000000000000000000000000000000000000000..e03d32136de402b075be7dd28946957cb28d8937
--- /dev/null
+++ b/DGE/Limma/miRNA/.Rhistory
@@ -0,0 +1,512 @@
+fit2 <- eBayes(fit2, 0.01)
+tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)
+tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","SPOT_ID","miRNA_ID"))
+write.table(tT, file=stdout(), row.names=F, sep="\t")
+# Visualize and quality control test results.
+# Build histogram of P-values for all genes. Normal test
+# assumption is that most genes are not differentially expressed.
+tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
+hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
+ylab = "Number of genes", main = "P-adj value distribution")
+# summarize test results as "up", "down" or "not expressed"
+dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1)
+# Venn diagram of results
+vennDiagram(dT, circle.col=palette())
+# create Q-Q plot for t-statistic
+t.good <- which(!is.na(fit2$F)) # filter out bad probes
+qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")
+# volcano plot (log P-value vs log fold change)
+colnames(fit2) # list contrast names
+ct <- 1        # choose contrast of interest
+volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20,
+highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2)))
+# MD plot (log fold change vs mean log expression)
+# highlight statistically significant (p-adj < 0.05) probes
+plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1)
+abline(h=0)
+################################################################
+# General expression data analysis
+ex <- exprs(gset)
+# box-and-whisker plot
+dev.new(width=3+ncol(gset)/6, height=5)
+ord <- order(gs)  # order samples by group
+palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02",
+"#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666"))
+par(mar=c(7,4,2,1))
+title <- paste ("GSE43732", "/", annotation(gset), sep ="")
+boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord])
+legend("topleft", groups, fill=palette(), bty="n")
+dev.off()
+# expression value distribution
+par(mar=c(4,4,2,1))
+title <- paste ("GSE43732", "/", annotation(gset), " value distribution", sep ="")
+plotDensities(ex, group=gs, main=title, legend ="topright")
+# UMAP plot (dimensionality reduction)
+ex <- na.omit(ex) # eliminate rows with NAs
+ex <- ex[!duplicated(ex), ]  # remove duplicates
+ump <- umap(t(ex), n_neighbors = 15, random_state = 123)
+par(mar=c(3,3,2,6), xpd=TRUE)
+plot(ump$layout, main="UMAP plot, nbrs=15", xlab="", ylab="", col=gs, pch=20, cex=1.5)
+legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20,
+col=1:nlevels(gs), title="Group", pt.cex=1.5)
+library("maptools")  # point labels without overlaps
+pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6)
+# mean-variance trend, helps to see if precision weights are needed
+plotSA(fit2, main="Mean variance trend, GSE43732")
+# volcano plot (log P-value vs log fold change)
+colnames(fit2) # list contrast names
+ct <- 1        # choose contrast of interest
+volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20,
+highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2)))
+ggplot(fit2, aes(x = logFC, y = -log10p)) +
+geom_point(color = "grey", alpha = 0.8) +
+geom_point(data = subset(volcano_df, logFC > logFC_threshold & -log10p > p_threshold),
+color = "red", alpha = 0.8) +
+xlim(c(-max(abs(fold_changes)), max(abs(fold_changes)))) +
+ylim(c(0, max(-log10(p_values)))) +
+labs(x = "Log2 Fold Change", y = "-log10 p-value", title = "Volcano Plot") +
+theme_minimal()
+# Example code
+library(ggplot)
+# Example code
+library(ggplot2)
+ggplot(fit2, aes(x = logFC, y = -log10p)) +
+geom_point(color = "grey", alpha = 0.8) +
+geom_point(data = subset(volcano_df, logFC > logFC_threshold & -log10p > p_threshold),
+color = "red", alpha = 0.8) +
+xlim(c(-max(abs(fold_changes)), max(abs(fold_changes)))) +
+ylim(c(0, max(-log10(p_values)))) +
+labs(x = "Log2 Fold Change", y = "-log10 p-value", title = "Volcano Plot") +
+theme_minimal()
+library(GEOquery)
+library(tidyverse)
+library(magrittr)
+library(multiMiR)
+library(writexl)
+miRNA_GSE43732 <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T)
+if (length(miRNA_GSE43732) > 1) idx <- grep("GPL4508", attr(miRNA_GSE43732, "names")) else idx <- 1
+miRNA_GSE43732 <- miRNA_GSE43732[[idx]]
+GSE43732 <- miRNA_GSE43732@assayData[["exprs"]]
+colx<- miRNA_GSE43732@featureData@data[["Name"]]
+GSE43732 <- cbind(colx,GSE43732)
+# Convert matrix to a dataframe
+df_GSE43732 <- as.data.frame(GSE43732)
+library(DESeq2)
+# Extract only the count matrix
+count_matrix <- df_GSE43732[, 1:ncol(df_GSE43732)]
+# Sample information
+sample_names <- colnames(count_matrix)
+conditions = c("cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "normal", "normal", "normal",
+"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal")
+sample_info <- data.frame(Sample = sample_names, condition = conditions)
+#### SKIRMISH FOR DESEQ2 DATASET ###########
+#make NA's to 0
+count_matrix <- as.matrix(sapply(count_matrix, as.numeric))
+count_matrix[is.na(count_matrix)] <- 0
+count_matrix <- round(count_matrix)
+#get rownames of previous data
+row_names_GSE43732 <- rownames(GSE43732)
+# Set gene_id to rownames again
+rownames(count_matrix) <- row_names_GSE43732
+#negative values to abs() --> simulate counts
+count_matrix <- abs(count_matrix)
+# Create the DESeqDataSet using the updated sample_info
+dds <- DESeqDataSetFromMatrix(countData = count_matrix,
+colData = sample_info,
+design = ~ condition)
+# Perform differential gene expression analysis
+dds <- DESeq(dds)
+resultsNames(dds)
+par(mar=c(8,5,2,2))
+boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)
+res <- results(dds, name="condition_normal_vs_cancer")
+summary(res)
+# Adjustment to overcome NA-error --> exclude rows containing NA's
+filtered_indices <- which(res$log2FoldChange > 1 & res$padj < 0.05 & !is.na(res$padj))
+filtered_downregulated_genes <- res[filtered_indices, ]
+gene_names <- rownames(filtered_downregulated_genes)
+gene_names
+results(dds)
+results <- results(dds)
+# Calculate the log2 fold change and -log10 adjusted p-values
+results$log2FoldChange <- with(results, ifelse(baseMean > 0, log2FoldChange, -log2FoldChange))
+results$negLog10padj <- -log10(results$padj)
+# Plot the volcano plot
+plot(results$log2FoldChange, results$negLog10padj,
+xlim = c(-max(abs(results$log2FoldChange)), max(abs(results$log2FoldChange)) + 1),
+ylim = c(0, max(results$negLog10padj) + 1),
+xlab = "Log2 Fold Change",
+ylab = "-log10(Adjusted p-value)",
+main = "Volcano Plot")
+# Calculate the log2 fold change and -log10 adjusted p-values
+results$log2FoldChange <- with(results, ifelse(baseMean > 0, log2FoldChange, -log2FoldChange))
+results$negLog10padj <- -log10(results$padj)
+# Determine the range of log2 fold change values
+logFC_range <- range(results$log2FoldChange, na.rm = TRUE)
+# Plot the volcano plot
+plot(results$log2FoldChange, results$negLog10padj,
+xlim = c(logFC_range[1] - 1, logFC_range[2] + 1),
+ylim = c(0, max(results$negLog10padj) + 1),
+xlab = "Log2 Fold Change",
+ylab = "-log10(Adjusted p-value)",
+main = "Volcano Plot")
+# Calculate the log2 fold change and -log10 adjusted p-values
+results$log2FoldChange <- with(results, ifelse(baseMean > 0, log2FoldChange, -log2FoldChange))
+results$negLog10padj <- -log10(results$padj)
+# Determine the range of -log10 adjusted p-values
+logP_range <- range(results$negLog10padj, na.rm = TRUE)
+# Plot the volcano plot
+plot(results$log2FoldChange, results$negLog10padj,
+xlim = c(min(results$log2FoldChange), max(results$log2FoldChange)),
+ylim = c(logP_range[1] - 1, logP_range[2] + 1),
+xlab = "Log2 Fold Change",
+ylab = "-log10(Adjusted p-value)",
+main = "Volcano Plot")
+# Add points for significantly differentially expressed genes
+significant_genes <- subset(results, padj < 0.05)
+points(significant_genes$log2FoldChange, significant_genes$negLog10padj,
+col = "red", pch = 16)
+# Calculate the log2 fold change and -log10 adjusted p-values
+results$log2FoldChange <- with(results, ifelse(baseMean > 0, log2FoldChange, -log2FoldChange))
+results$negLog10padj <- -log10(results$padj)
+ggplot(res_tableOE_tb, aes(x = log2FoldChange, y = -log10(pvalue))) +
+geom_point(aes(colour = threshold_OE)) +
+geom_text_repel(aes(label = genelabels)) +
+xlab("log2 fold change") +
+ylab("-log10 p-value") +
+theme(legend.position = c(0.8, 0.2),
+plot.title = element_text(size = rel(1.5), hjust = 0.5),
+axis.title = element_text(size = rel(1.25)))
+ggplot(results, aes(x = log2FoldChange, y = -log10(pvalue))) +
+geom_point(aes(colour = threshold_OE)) +
+geom_text_repel(aes(label = genelabels)) +
+xlab("log2 fold change") +
+ylab("-log10 p-value") +
+theme(legend.position = c(0.8, 0.2),
+plot.title = element_text(size = rel(1.5), hjust = 0.5),
+axis.title = element_text(size = rel(1.25)))
+View(results)
+library(GEOquery)
+library(limma)
+library(umap)
+gset <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=FALSE)
+if (length(gset) > 1) idx <- grep("GPL16543", attr(gset, "names")) else idx <- 1
+gset <- gset[[idx]]
+# make proper column names to match toptable
+fvarLabels(gset) <- make.names(fvarLabels(gset))
+# group membership for all samples
+gsms <- paste0("11110000111100001111000011110000111100001111000011",
+"11000011110000111100001111000011110000111100001111",
+"00001111000011110000111100000111100011110000111100",
+"00111100001011110000111100001111000011110000111100",
+"00111000111100001111000011100011110000")
+sml <- strsplit(gsms, split="")[[1]]
+# log2 transformation
+ex <- exprs(gset)
+qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
+LogC <- (qx[5] > 100) ||
+(qx[6]-qx[1] > 50 && qx[2] > 0)
+if (LogC) { ex[which(ex <= 0)] <- NaN
+exprs(gset) <- log2(ex) }
+# assign samples to groups and set up design matrix
+gs <- factor(sml)
+groups <- make.names(c("Normal","Cancer"))
+levels(gs) <- groups
+gset$group <- gs
+design <- model.matrix(~group + 0, gset)
+colnames(design) <- levels(gs)
+gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
+fit <- lmFit(gset, design)  # fit linear model
+# set up contrasts of interest and recalculate model coefficients
+cts <- paste(groups[1], groups[2], sep="-")
+cont.matrix <- makeContrasts(contrasts=cts, levels=design)
+fit2 <- contrasts.fit(fit, cont.matrix)
+# compute statistics and table of top significant genes
+fit2 <- eBayes(fit2, 0.01)
+tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)
+tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","SPOT_ID","miRNA_ID"))
+write.table(tT, file=stdout(), row.names=F, sep="\t")
+# Visualize and quality control test results.
+# Build histogram of P-values for all genes. Normal test
+# assumption is that most genes are not differentially expressed.
+tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
+hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
+ylab = "Number of genes", main = "P-adj value distribution")
+# summarize test results as "up", "down" or "not expressed"
+dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1)
+# Venn diagram of results
+vennDiagram(dT, circle.col=palette())
+# create Q-Q plot for t-statistic
+t.good <- which(!is.na(fit2$F)) # filter out bad probes
+qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")
+# volcano plot (log P-value vs log fold change)
+colnames(fit2) # list contrast names
+ct <- 1        # choose contrast of interest
+volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20,
+highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2)))
+# MD plot (log fold change vs mean log expression)
+# highlight statistically significant (p-adj < 0.05) probes
+plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1)
+abline(h=0)
+library(GEOquery)
+library(limma)
+library(umap)
+gset <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=FALSE)
+if (length(gset) > 1) idx <- grep("GPL16543", attr(gset, "names")) else idx <- 1
+gset <- gset[[idx]]
+# make proper column names to match toptable
+fvarLabels(gset) <- make.names(fvarLabels(gset))
+# group membership for all samples
+gsms <- paste0("11110000111100001111000011110000111100001111000011",
+"11000011110000111100001111000011110000111100001111",
+"00001111000011110000111100000111100011110000111100",
+"00111100001011110000111100001111000011110000111100",
+"00111000111100001111000011100011110000")
+sml <- strsplit(gsms, split="")[[1]]
+# log2 transformation
+ex <- exprs(gset)
+qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
+LogC <- (qx[5] > 100) ||
+(qx[6]-qx[1] > 50 && qx[2] > 0)
+if (LogC) { ex[which(ex <= 0)] <- NaN
+exprs(gset) <- log2(ex) }
+# assign samples to groups and set up design matrix
+gs <- factor(sml)
+groups <- make.names(c("Normal","Cancer"))
+levels(gs) <- groups
+gset$group <- gs
+design <- model.matrix(~group + 0, gset)
+colnames(design) <- levels(gs)
+gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
+fit <- lmFit(gset, design)  # fit linear model
+# set up contrasts of interest and recalculate model coefficients
+cts <- paste(groups[1], groups[2], sep="-")
+cont.matrix <- makeContrasts(contrasts=cts, levels=design)
+fit2 <- contrasts.fit(fit, cont.matrix)
+# compute statistics and table of top significant genes
+fit2 <- eBayes(fit2, 0.01)
+tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)
+tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","SPOT_ID","miRNA_ID"))
+write.table(tT, file=stdout(), row.names=F, sep="\t")
+# Visualize and quality control test results.
+# Build histogram of P-values for all genes. Normal test
+# assumption is that most genes are not differentially expressed.
+tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
+hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
+ylab = "Number of genes", main = "P-adj value distribution")
+# summarize test results as "up", "down" or "not expressed"
+dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1)
+# Venn diagram of results
+vennDiagram(dT, circle.col=palette())
+# create Q-Q plot for t-statistic
+t.good <- which(!is.na(fit2$F)) # filter out bad probes
+qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")
+library(GEOquery)
+library(limma)
+library(umap)
+gset <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=FALSE)
+if (length(gset) > 1) idx <- grep("GPL16543", attr(gset, "names")) else idx <- 1
+gset <- gset[[idx]]
+# make proper column names to match toptable
+fvarLabels(gset) <- make.names(fvarLabels(gset))
+# group membership for all samples
+gsms <- paste0("11110000111100001111000011110000111100001111000011",
+"11000011110000111100001111000011110000111100001111",
+"00001111000011110000111100000111100011110000111100",
+"00111100001011110000111100001111000011110000111100",
+"00111000111100001111000011100011110000")
+sml <- strsplit(gsms, split="")[[1]]
+# log2 transformation
+ex <- exprs(gset)
+qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
+LogC <- (qx[5] > 100) ||
+(qx[6]-qx[1] > 50 && qx[2] > 0)
+if (LogC) { ex[which(ex <= 0)] <- NaN
+exprs(gset) <- log2(ex) }
+# assign samples to groups and set up design matrix
+gs <- factor(sml)
+groups <- make.names(c("Normal","Cancer"))
+levels(gs) <- groups
+gset$group <- gs
+design <- model.matrix(~group + 0, gset)
+colnames(design) <- levels(gs)
+gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
+fit <- lmFit(gset, design)  # fit linear model
+# set up contrasts of interest and recalculate model coefficients
+cts <- paste(groups[1], groups[2], sep="-")
+cont.matrix <- makeContrasts(contrasts=cts, levels=design)
+fit2 <- contrasts.fit(fit, cont.matrix)
+# compute statistics and table of top significant genes
+fit2 <- eBayes(fit2, 0.01)
+tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)
+tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","SPOT_ID","miRNA_ID"))
+write.table(tT, file=stdout(), row.names=F, sep="\t")
+# Visualize and quality control test results.
+# Build histogram of P-values for all genes. Normal test
+# assumption is that most genes are not differentially expressed.
+tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
+hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
+ylab = "Number of genes", main = "P-adj value distribution")
+# summarize test results as "up", "down" or "not expressed"
+dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1)
+# Venn diagram of results
+vennDiagram(dT, circle.col=palette())
+# create Q-Q plot for t-statistic
+t.good <- which(!is.na(fit2$F)) # filter out bad probes
+qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")
+# volcano plot (log P-value vs log fold change)
+colnames(fit2) # list contrast names
+ct <- 1        # choose contrast of interest
+volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20,
+highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2)))
+library(GEOquery)
+library(limma)
+library(umap)
+gset <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=FALSE)
+if (length(gset) > 1) idx <- grep("GPL16543", attr(gset, "names")) else idx <- 1
+gset <- gset[[idx]]
+# make proper column names to match toptable
+fvarLabels(gset) <- make.names(fvarLabels(gset))
+# group membership for all samples
+gsms <- paste0("11110000111100001111000011110000111100001111000011",
+"11000011110000111100001111000011110000111100001111",
+"00001111000011110000111100000111100011110000111100",
+"00111100001011110000111100001111000011110000111100",
+"00111000111100001111000011100011110000")
+sml <- strsplit(gsms, split="")[[1]]
+# log2 transformation
+ex <- exprs(gset)
+qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
+LogC <- (qx[5] > 100) ||
+(qx[6]-qx[1] > 50 && qx[2] > 0)
+if (LogC) { ex[which(ex <= 0)] <- NaN
+exprs(gset) <- log2(ex) }
+# assign samples to groups and set up design matrix
+gs <- factor(sml)
+groups <- make.names(c("Normal","Cancer"))
+levels(gs) <- groups
+gset$group <- gs
+design <- model.matrix(~group + 0, gset)
+colnames(design) <- levels(gs)
+gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
+fit <- lmFit(gset, design)  # fit linear model
+# set up contrasts of interest and recalculate model coefficients
+cts <- paste(groups[1], groups[2], sep="-")
+cont.matrix <- makeContrasts(contrasts=cts, levels=design)
+fit2 <- contrasts.fit(fit, cont.matrix)
+# compute statistics and table of top significant genes
+fit2 <- eBayes(fit2, 0.01)
+tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)
+tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","SPOT_ID","miRNA_ID"))
+write.table(tT, file=stdout(), row.names=F, sep="\t")
+# Visualize and quality control test results.
+# Build histogram of P-values for all genes. Normal test
+# assumption is that most genes are not differentially expressed.
+tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
+hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
+ylab = "Number of genes", main = "P-adj value distribution")
+# summarize test results as "up", "down" or "not expressed"
+dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1)
+# Venn diagram of results
+vennDiagram(dT, circle.col=palette())
+# create Q-Q plot for t-statistic
+t.good <- which(!is.na(fit2$F)) # filter out bad probes
+qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")
+# volcano plot (log P-value vs log fold change)
+colnames(fit2) # list contrast names
+ct <- 1        # choose contrast of interest
+volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20,
+highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2)))
+# MD plot (log fold change vs mean log expression)
+# highlight statistically significant (p-adj < 0.05) probes
+plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1)
+abline(h=0)
+################################################################
+# General expression data analysis
+ex <- exprs(gset)
+# box-and-whisker plot
+dev.new(width=3+ncol(gset)/6, height=5)
+ord <- order(gs)  # order samples by group
+palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02",
+"#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666"))
+par(mar=c(7,4,2,1))
+title <- paste ("GSE43732", "/", annotation(gset), sep ="")
+boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord])
+legend("topleft", groups, fill=palette(), bty="n")
+dev.off()
+# expression value distribution
+par(mar=c(4,4,2,1))
+title <- paste ("GSE43732", "/", annotation(gset), " value distribution", sep ="")
+plotDensities(ex, group=gs, main=title, legend ="topright")
+# UMAP plot (dimensionality reduction)
+ex <- na.omit(ex) # eliminate rows with NAs
+ex <- ex[!duplicated(ex), ]  # remove duplicates
+ump <- umap(t(ex), n_neighbors = 15, random_state = 123)
+par(mar=c(3,3,2,6), xpd=TRUE)
+plot(ump$layout, main="UMAP plot, nbrs=15", xlab="", ylab="", col=gs, pch=20, cex=1.5)
+legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20,
+col=1:nlevels(gs), title="Group", pt.cex=1.5)
+library("maptools")  # point labels without overlaps
+pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6)
+# mean-variance trend, helps to see if precision weights are needed
+plotSA(fit2, main="Mean variance trend, GSE43732")
+mirnames <- read.csv("C:/Users/Emre/Desktop/miRNAs.csv")
+mirnames <- read.csv("C:/Users/Emre/Desktop/miRNAs.xlsx")
+View(mirnames)
+mirnames <- read.csv("C:/Users/Emre/Desktop/miRNAs.xlsx", header = TRUE)
+library(multiMiR)
+library(readxl)
+mirnames <- read_excel("C:/Users/Emre/Desktop/miRNAs.xlsx", header = TRUE)
+mirnames <- read_excel("C:/Users/Emre/Desktop/miRNAs.xlsx")
+View(mirnames)
+library(multiMiR)
+multimir_results <- get_multimir(org     = 'hsa',
+mirna   = mirnames,
+table   = 'validated',
+summary = TRUE)
+head(multimir_results@data)
+targets <- multimir_results@data
+excel_file <- "C:/Users/Emre/Desktop/multimir_results_cyto.xlsx"
+write_xlsx(targets, path = excel_file )
+library("xlsx")
+library(xlsx)
+install.packages(xlsx)
+install.packages("xlsx")
+library(xlsx)
+write_xlsx(targets, path = excel_file )
+targets <- multimir_results@data
+excel_file <- ("C:/Users/Emre/Desktop/multimir_results_cyto.xlsx")
+library(xlsx)
+write_xlsx(targets, path = excel_file )
+write_xlsx(targets, path = excel_file )
+library(writexl)
+write_xlsx(targets, path = excel_file )
diff --git a/DGE/Limma/miRNA/GSE43732.R b/DGE/Limma/miRNA/GSE43732.R
new file mode 100644
index 0000000000000000000000000000000000000000..b30697e99ef34015d25767f64f2c3eaf432e0a40
--- /dev/null
+++ b/DGE/Limma/miRNA/GSE43732.R
@@ -0,0 +1,114 @@
+library(GEOquery)
+library(limma)
+library(umap)
+
+# load series and platform data from GEO
+
+gset <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=FALSE)
+if (length(gset) > 1) idx <- grep("GPL16543", attr(gset, "names")) else idx <- 1
+gset <- gset[[idx]]
+
+# make proper column names to match toptable 
+fvarLabels(gset) <- make.names(fvarLabels(gset))
+
+# group membership for all samples
+gsms <- paste0("11110000111100001111000011110000111100001111000011",
+        "11000011110000111100001111000011110000111100001111",
+        "00001111000011110000111100000111100011110000111100",
+        "00111100001011110000111100001111000011110000111100",
+        "00111000111100001111000011100011110000")
+sml <- strsplit(gsms, split="")[[1]]
+
+# log2 transformation
+ex <- exprs(gset)
+qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
+LogC <- (qx[5] > 100) ||
+          (qx[6]-qx[1] > 50 && qx[2] > 0)
+if (LogC) { ex[which(ex <= 0)] <- NaN
+  exprs(gset) <- log2(ex) }
+
+# assign samples to groups and set up design matrix
+gs <- factor(sml)
+groups <- make.names(c("Normal","Cancer"))
+levels(gs) <- groups
+gset$group <- gs
+design <- model.matrix(~group + 0, gset)
+colnames(design) <- levels(gs)
+
+gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
+
+fit <- lmFit(gset, design)  # fit linear model
+
+# set up contrasts of interest and recalculate model coefficients
+cts <- paste(groups[1], groups[2], sep="-")
+cont.matrix <- makeContrasts(contrasts=cts, levels=design)
+fit2 <- contrasts.fit(fit, cont.matrix)
+
+# compute statistics and table of top significant genes
+fit2 <- eBayes(fit2, 0.01)
+tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)
+
+tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","SPOT_ID","miRNA_ID"))
+write.table(tT, file=stdout(), row.names=F, sep="\t")
+
+# Visualize and quality control test results.
+# Build histogram of P-values for all genes. Normal test
+# assumption is that most genes are not differentially expressed.
+tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
+hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
+  ylab = "Number of genes", main = "P-adj value distribution")
+
+# summarize test results as "up", "down" or "not expressed"
+dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1)
+
+# Venn diagram of results
+vennDiagram(dT, circle.col=palette())
+
+# create Q-Q plot for t-statistic
+t.good <- which(!is.na(fit2$F)) # filter out bad probes
+qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")
+
+# volcano plot (log P-value vs log fold change)
+colnames(fit2) # list contrast names
+ct <- 1        # choose contrast of interest
+volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20,
+  highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2)))
+
+# MD plot (log fold change vs mean log expression)
+# highlight statistically significant (p-adj < 0.05) probes
+plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1)
+abline(h=0)
+
+################################################################
+# General expression data analysis
+ex <- exprs(gset)
+
+# box-and-whisker plot
+dev.new(width=3+ncol(gset)/6, height=5)
+ord <- order(gs)  # order samples by group
+palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02",
+          "#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666"))
+par(mar=c(7,4,2,1))
+title <- paste ("GSE43732", "/", annotation(gset), sep ="")
+boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord])
+legend("topleft", groups, fill=palette(), bty="n")
+dev.off()
+
+# expression value distribution
+par(mar=c(4,4,2,1))
+title <- paste ("GSE43732", "/", annotation(gset), " value distribution", sep ="")
+plotDensities(ex, group=gs, main=title, legend ="topright")
+
+# UMAP plot (dimensionality reduction)
+ex <- na.omit(ex) # eliminate rows with NAs
+ex <- ex[!duplicated(ex), ]  # remove duplicates
+ump <- umap(t(ex), n_neighbors = 15, random_state = 123)
+par(mar=c(3,3,2,6), xpd=TRUE)
+plot(ump$layout, main="UMAP plot, nbrs=15", xlab="", ylab="", col=gs, pch=20, cex=1.5)
+legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20,
+col=1:nlevels(gs), title="Group", pt.cex=1.5)
+library("maptools")  # point labels without overlaps
+pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6)
+
+# mean-variance trend, helps to see if precision weights are needed
+plotSA(fit2, main="Mean variance trend, GSE43732")
\ No newline at end of file
diff --git a/DGE/Limma/miRNA/GSE55857.R b/DGE/Limma/miRNA/GSE55857.R
new file mode 100644
index 0000000000000000000000000000000000000000..c16aa4f6ffa192adbe2d3737c7f402a9669fd1d4
--- /dev/null
+++ b/DGE/Limma/miRNA/GSE55857.R
@@ -0,0 +1,117 @@
+# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0
+################################################################
+#   Differential expression analysis with limma
+library(GEOquery)
+library(limma)
+library(umap)
+
+# load series and platform data from GEO
+
+gset <- getGEO("GSE55857", GSEMatrix =TRUE, AnnotGPL=FALSE)
+if (length(gset) > 1) idx <- grep("GPL14613", attr(gset, "names")) else idx <- 1
+gset <- gset[[idx]]
+
+# make proper column names to match toptable 
+fvarLabels(gset) <- make.names(fvarLabels(gset))
+
+# group membership for all samples
+gsms <- paste0("10101010101010101010101010101010101010101010101010",
+               "10101010101010101010101010101010101010101010101010",
+               "10101010101010101010101010101010101010101010101010",
+               "10101010101010101010101010101010101010101010101010",
+               "1010101010101010")
+sml <- strsplit(gsms, split="")[[1]]
+
+# log2 transformation
+ex <- exprs(gset)
+qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
+LogC <- (qx[5] > 100) ||
+  (qx[6]-qx[1] > 50 && qx[2] > 0)
+if (LogC) { ex[which(ex <= 0)] <- NaN
+exprs(gset) <- log2(ex) }
+
+# assign samples to groups and set up design matrix
+gs <- factor(sml)
+groups <- make.names(c("cancer","normal"))
+levels(gs) <- groups
+gset$group <- gs
+design <- model.matrix(~group + 0, gset)
+colnames(design) <- levels(gs)
+
+gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
+
+fit <- lmFit(gset, design)  # fit linear model
+
+# set up contrasts of interest and recalculate model coefficients
+cts <- c(paste(groups[1],"-",groups[2],sep=""))
+cont.matrix <- makeContrasts(contrasts=cts, levels=design)
+fit2 <- contrasts.fit(fit, cont.matrix)
+
+# compute statistics and table of top significant genes
+fit2 <- eBayes(fit2, 0.01)
+tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)
+
+tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","miRNA_ID_LIST","SPOT_ID"))
+write.table(tT, file=stdout(), row.names=F, sep="\t")
+
+# Visualize and quality control test results.
+# Build histogram of P-values for all genes. Normal test
+# assumption is that most genes are not differentially expressed.
+tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
+hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
+     ylab = "Number of genes", main = "P-adj value distribution")
+
+# summarize test results as "up", "down" or "not expressed"
+dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1)
+
+# Venn diagram of results
+vennDiagram(dT, circle.col=palette())
+
+# create Q-Q plot for t-statistic
+t.good <- which(!is.na(fit2$F)) # filter out bad probes
+qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")
+
+# volcano plot (log P-value vs log fold change)
+colnames(fit2) # list contrast names
+ct <- 1        # choose contrast of interest
+volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20,
+            highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2)))
+
+# MD plot (log fold change vs mean log expression)
+# highlight statistically significant (p-adj < 0.05) probes
+plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1)
+abline(h=0)
+
+################################################################
+# General expression data analysis
+ex <- exprs(gset)
+
+# box-and-whisker plot
+dev.new(width=3+ncol(gset)/6, height=5)
+ord <- order(gs)  # order samples by group
+palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02",
+          "#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666"))
+par(mar=c(7,4,2,1))
+title <- paste ("GSE55857", "/", annotation(gset), sep ="")
+boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord])
+legend("topleft", groups, fill=palette(), bty="n")
+dev.off()
+
+# expression value distribution
+par(mar=c(4,4,2,1))
+title <- paste ("GSE55857", "/", annotation(gset), " value distribution", sep ="")
+plotDensities(ex, group=gs, main=title, legend ="topright")
+
+# UMAP plot (dimensionality reduction)
+ex <- na.omit(ex) # eliminate rows with NAs
+ex <- ex[!duplicated(ex), ]  # remove duplicates
+ump <- umap(t(ex), n_neighbors = 15, random_state = 123)
+par(mar=c(3,3,2,6), xpd=TRUE)
+plot(ump$layout, main="UMAP plot, nbrs=15", xlab="", ylab="", col=gs, pch=20, cex=1.5)
+legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20,
+       col=1:nlevels(gs), title="Group", pt.cex=1.5)
+library("maptools")  # point labels without overlaps
+pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6)
+
+# mean-variance trend, helps to see if precision weights are needed
+plotSA(fit2, main="Mean variance trend, GSE55857")
\ No newline at end of file
diff --git a/DGE/Limma/miRNA/GSE6188.R b/DGE/Limma/miRNA/GSE6188.R
new file mode 100644
index 0000000000000000000000000000000000000000..0fa5a582fdd4f4f4bf6138ea522a67cbebbebc13
--- /dev/null
+++ b/DGE/Limma/miRNA/GSE6188.R
@@ -0,0 +1,118 @@
+# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0
+################################################################
+#   Differential expression analysis with limma
+library(GEOquery)
+library(limma)
+library(umap)
+
+# load series and platform data from GEO
+
+gset <- getGEO("GSE6188", GSEMatrix =TRUE, AnnotGPL=FALSE)
+if (length(gset) > 1) idx <- grep("GPL4508", attr(gset, "names")) else idx <- 1
+gset <- gset[[idx]]
+
+# make proper column names to match toptable 
+fvarLabels(gset) <- make.names(fvarLabels(gset))
+
+# group membership for all samples
+gsms <- paste0("00000011111100110000001111110000000000000011111111",
+               "11110000000000111111111100000000000000111111111111",
+               "11000111100011111110011001100110011001100000011110",
+               "01100110101101001101100110011001111001100110011001",
+               "10011001100111111111111111111111111111111111111111",
+               "1111111")
+sml <- strsplit(gsms, split="")[[1]]
+
+# log2 transformation
+ex <- exprs(gset)
+qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
+LogC <- (qx[5] > 100) ||
+  (qx[6]-qx[1] > 50 && qx[2] > 0)
+if (LogC) { ex[which(ex <= 0)] <- NaN
+exprs(gset) <- log2(ex) }
+
+# assign samples to groups and set up design matrix
+gs <- factor(sml)
+groups <- make.names(c("Normal","Cancer"))
+levels(gs) <- groups
+gset$group <- gs
+design <- model.matrix(~group + 0, gset)
+colnames(design) <- levels(gs)
+
+gset <- gset[complete.cases(exprs(gset)), ] # skip missing values
+
+fit <- lmFit(gset, design)  # fit linear model
+
+# set up contrasts of interest and recalculate model coefficients
+cts <- c(paste(groups[1],"-",groups[2],sep=""))
+cont.matrix <- makeContrasts(contrasts=cts, levels=design)
+fit2 <- contrasts.fit(fit, cont.matrix)
+
+# compute statistics and table of top significant genes
+fit2 <- eBayes(fit2, 0.01)
+tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250)
+
+tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","ORF","SPOT_ID"))
+write.table(tT, file=stdout(), row.names=F, sep="\t")
+
+# Visualize and quality control test results.
+# Build histogram of P-values for all genes. Normal test
+# assumption is that most genes are not differentially expressed.
+tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf)
+hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
+     ylab = "Number of genes", main = "P-adj value distribution")
+
+# summarize test results as "up", "down" or "not expressed"
+dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1)
+
+# Venn diagram of results
+vennDiagram(dT, circle.col=palette())
+
+# create Q-Q plot for t-statistic
+t.good <- which(!is.na(fit2$F)) # filter out bad probes
+qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic")
+
+# volcano plot (log P-value vs log fold change)
+colnames(fit2) # list contrast names
+ct <- 1        # choose contrast of interest
+volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20,
+            highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2)))
+
+# MD plot (log fold change vs mean log expression)
+# highlight statistically significant (p-adj < 0.05) probes
+plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1)
+abline(h=0)
+
+################################################################
+# General expression data analysis
+ex <- exprs(gset)
+
+# box-and-whisker plot
+dev.new(width=3+ncol(gset)/6, height=5)
+ord <- order(gs)  # order samples by group
+palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02",
+          "#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666"))
+par(mar=c(7,4,2,1))
+title <- paste ("GSE6188", "/", annotation(gset), sep ="")
+boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord])
+legend("topleft", groups, fill=palette(), bty="n")
+dev.off()
+
+# expression value distribution
+par(mar=c(4,4,2,1))
+title <- paste ("GSE6188", "/", annotation(gset), " value distribution", sep ="")
+plotDensities(ex, group=gs, main=title, legend ="topright")
+
+# UMAP plot (dimensionality reduction)
+ex <- na.omit(ex) # eliminate rows with NAs
+ex <- ex[!duplicated(ex), ]  # remove duplicates
+ump <- umap(t(ex), n_neighbors = 15, random_state = 123)
+par(mar=c(3,3,2,6), xpd=TRUE)
+plot(ump$layout, main="UMAP plot, nbrs=15", xlab="", ylab="", col=gs, pch=20, cex=1.5)
+legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20,
+       col=1:nlevels(gs), title="Group", pt.cex=1.5)
+library("maptools")  # point labels without overlaps
+pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6)
+
+# mean-variance trend, helps to see if precision weights are needed
+plotSA(fit2, main="Mean variance trend, GSE6188")
\ No newline at end of file
diff --git a/Data_Annotation/GO_enrichment/GO_molecular_function.png b/Data_Annotation/GO_enrichment/GO_molecular_function.png
new file mode 100644
index 0000000000000000000000000000000000000000..98a9cda956855889c0f17b5c045c7f6dc7f66566
Binary files /dev/null and b/Data_Annotation/GO_enrichment/GO_molecular_function.png differ
diff --git a/MultiMIR/multimir_results.xlsx b/MultiMIR/multimir_results.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..4efaed048d3a9e42642d670fc34f188419f4bafb
Binary files /dev/null and b/MultiMIR/multimir_results.xlsx differ
diff --git a/MultiMIR/multimir_results_final.xlsx b/MultiMIR/multimir_results_final.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..848982cc118e450f41332c9cd21caec14050bde5
Binary files /dev/null and b/MultiMIR/multimir_results_final.xlsx differ
diff --git a/project.PNG b/project.PNG
new file mode 100644
index 0000000000000000000000000000000000000000..5468604b154878df572abaf37dcb9746783c6853
Binary files /dev/null and b/project.PNG differ