diff --git a/DGE/Deseq2/mRNA/.Rhistory b/DGE/Deseq2/mRNA/.Rhistory new file mode 100644 index 0000000000000000000000000000000000000000..768671fc8456938ce255bcc0917fc8a238462f73 --- /dev/null +++ b/DGE/Deseq2/mRNA/.Rhistory @@ -0,0 +1,426 @@ +library(GEOquery) +library(tidyverse) +library(magrittr) +mRNA_GSE20347 <- getGEO("GSE20347", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T) +if (length(mRNA_GSE20347) > 1) idx <- grep("GPL4508", attr(mRNA_GSE20347, "names")) else idx <- 1 +mRNA_GSE20347 <- mRNA_GSE20347[[idx]] +GSE20347<- mRNA_GSE20347@assayData[["exprs"]] +colx<- mRNA_GSE20347@featureData@data[["Gene symbol"]] +GSE20347<- cbind(colx,GSE20347) +# Convert matrix to a dataframe +df_GSE20347 <- as.data.frame(GSE20347) +library(DESeq2) +# Extract only the count matrix +count_matrix <- df_GSE20347[, 2:ncol(df_GSE20347)] +# Sample information +sample_names <- colnames(count_matrix) +# Create the sample_info data frame +conditions <- c(rep("normal adjacent esophageal tissue", 17), rep("esophageal squamous cell carcinoma", length(sample_names) - 17)) +sample_info <- data.frame(Sample = sample_names, Condition = conditions) +#### SKIRMISH FOR DESEQ2 DATASET ########### +count_matrix <- as.matrix(sapply(count_matrix, as.numeric)) +count_matrix <- count_matrix[complete.cases(count_matrix), ] +count_matrix <- round(count_matrix) +#get rownames of previous data +row_names_GSE20347 <- rownames(GSE20347) +# Set gene_id to rownames again +rownames(count_matrix) <- row_names_GSE20347 +# Create the DESeqDataSet using the updated sample_info +dds <- DESeqDataSetFromMatrix(countData = count_matrix, +colData = sample_info, +design = ~ Condition) +# Perform differential gene expression analysis +dds <- DESeq(dds) +resultsNames(dds) +par(mar=c(8,5,2,2)) +boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2) +res <- results(dds, name="Condition_normal.adjacent.esophageal.tissue_vs_esophageal.squamous.cell.carcinoma") +summary(res) +# Adjustment to overcome NA-error --> exclude rows containing NA's +filtered_indices <- which(res$log2FoldChange < 1 & res$log2FoldChange > -1 & res$padj < 0.05 & !is.na(res$padj)) +filtered_downregulated_genes <- res[filtered_indices, ] +gene_names <- rownames(filtered_downregulated_genes) +gene_names +res2 <- results(dds, name = "Condition_normal.adjacent.esophageal.tissue_vs_esophageal.squamous.cell.carcinoma") +# Convert p-value to -log10(p-value) +res2$log10_pvalue <- -log10(res2$pvalue) +# Create the volcano plot +volcano_plot <- ggplot(data = res2, aes(x = log2FoldChange, y = log10_pvalue)) + +geom_point() + +labs(x = "log2 Fold Change", y = "-log10 p-value") + +theme_minimal() +# Display the volcano plot +print(volcano_plot) +res2 <- results(dds, name = "Condition_normal.adjacent.esophageal.tissue_vs_esophageal.squamous.cell.carcinoma") +# Convert p-value to -log10(p-value) +res2$log10_pvalue <- -log10(res2$pvalue) +# Create the volcano plot +volcano_plot <- ggplot(data = res2, aes(x = log2FoldChange, y = log10_pvalue)) + +geom_point() + +labs(x = "log2 Fold Change", y = "-log10 p-value") + +theme_minimal() +library(GEOquery) +library(tidyverse) +library(magrittr) +library(DESeq2) +library(ggrepel) +mRNA_GSE20347 <- getGEO("GSE20347", GSEMatrix = TRUE, AnnotGPL = TRUE, getGPL = TRUE) +if (length(mRNA_GSE20347) > 1) idx <- grep("GPL4508", attr(mRNA_GSE20347, "names")) else idx <- 1 +mRNA_GSE20347 <- mRNA_GSE20347[[idx]] +GSE20347 <- mRNA_GSE20347@assayData[["exprs"]] +colx <- mRNA_GSE20347@featureData@data[["Gene symbol"]] +GSE20347 <- cbind(colx, GSE20347) +# Convert matrix to a dataframe +df_GSE20347 <- as.data.frame(GSE20347) +# Extract only the count matrix +count_matrix <- df_GSE20347[, 2:ncol(df_GSE20347)] +# Sample information +sample_names <- colnames(count_matrix) +# Create the sample_info data frame +conditions <- c(rep("normal adjacent esophageal tissue", 17), rep("esophageal squamous cell carcinoma", length(sample_names) - 17)) +sample_info <- data.frame(Sample = sample_names, Condition = conditions) +#### SKIRMISH FOR DESEQ2 DATASET ########### +count_matrix <- as.matrix(sapply(count_matrix, as.numeric)) +count_matrix <- count_matrix[complete.cases(count_matrix), ] +count_matrix <- round(count_matrix) +# Get rownames of previous data +row_names_GSE20347 <- rownames(GSE20347) +# Set gene_id to rownames again +rownames(count_matrix) <- row_names_GSE20347 +# Create the DESeqDataSet using the updated sample_info +dds <- DESeqDataSetFromMatrix(countData = count_matrix, +colData = sample_info, +design = ~ Condition) +# Perform differential gene expression analysis +dds <- DESeq(dds) +# Volcano plot +res <- results(dds) +res$logPadj <- -log10(res$padj) +res$logFC <- res$log2FoldChange +res$Sig <- ifelse(abs(res$log2FoldChange) > 1 & res$padj < 0.05, "Yes", "No") +# Plot volcano plot +ggplot(res, aes(x = logFC, y = logPadj, color = Sig)) + +geom_point(alpha = 0.6) + +geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") + +geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "blue") + +xlim(c(-max(abs(res$log2FoldChange)), max(abs(res$log2FoldChange)))) + +labs(x = "log2 Fold Change", y = "-log10(Adjusted p-value)", title = "Volcano Plot") + +theme_minimal() + +theme(legend.position = "none") +library(GEOquery) +library(tidyverse) +library(magrittr) +mRNA_GSE20347 <- getGEO("GSE20347", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T) +if (length(mRNA_GSE20347) > 1) idx <- grep("GPL4508", attr(mRNA_GSE20347, "names")) else idx <- 1 +mRNA_GSE20347 <- mRNA_GSE20347[[idx]] +GSE20347<- mRNA_GSE20347@assayData[["exprs"]] +colx<- mRNA_GSE20347@featureData@data[["Gene symbol"]] +GSE20347<- cbind(colx,GSE20347) +# Convert matrix to a dataframe +df_GSE20347 <- as.data.frame(GSE20347) +library(DESeq2) +# Extract only the count matrix +count_matrix <- df_GSE20347[, 2:ncol(df_GSE20347)] +# Sample information +sample_names <- colnames(count_matrix) +# Create the sample_info data frame +conditions <- c(rep("normal adjacent esophageal tissue", 17), rep("esophageal squamous cell carcinoma", length(sample_names) - 17)) +sample_info <- data.frame(Sample = sample_names, Condition = conditions) +#### SKIRMISH FOR DESEQ2 DATASET ########### +count_matrix <- as.matrix(sapply(count_matrix, as.numeric)) +count_matrix <- count_matrix[complete.cases(count_matrix), ] +count_matrix <- round(count_matrix) +#get rownames of previous data +row_names_GSE20347 <- rownames(GSE20347) +# Set gene_id to rownames again +rownames(count_matrix) <- row_names_GSE20347 +# Create the DESeqDataSet using the updated sample_info +dds <- DESeqDataSetFromMatrix(countData = count_matrix, +colData = sample_info, +design = ~ Condition) +# Perform differential gene expression analysis +dds <- DESeq(dds) +resultsNames(dds) +par(mar=c(8,5,2,2)) +boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2) +res <- results(dds, name="Condition_normal.adjacent.esophageal.tissue_vs_esophageal.squamous.cell.carcinoma") +summary(res) +# Adjustment to overcome NA-error --> exclude rows containing NA's +filtered_indices <- which(res$log2FoldChange < 1 & res$log2FoldChange > -1 & res$padj < 0.05 & !is.na(res$padj)) +filtered_downregulated_genes <- res[filtered_indices, ] +gene_names <- rownames(filtered_downregulated_genes) +gene_names +res2 <- as.data.frame(results(dds, name = "Condition_normal.adjacent.esophageal.tissue_vs_esophageal.squamous.cell.carcinoma")) +# Convert p-value to -log10(p-value) +res2$log10_pvalue <- -log10(res2$pvalue) +# Create the volcano plot +volcano_plot <- ggplot(data = res2, aes(x = log2FoldChange, y = log10_pvalue)) + +geom_point() + +labs(x = "log2 Fold Change", y = "-log10 p-value") + +theme_minimal() +# Display the volcano plot +print(volcano_plot) +# Add vertical lines for fold change thresholds +volcano_plot <- volcano_plot + +geom_vline(xintercept = c(-1, 1), color = "red") + +geom_hline(yintercept = -log10(0.05), color = "red") +# Display the customized volcano plot +print(volcano_plot) +# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0 +################################################################ +# Differential expression analysis with limma +library(GEOquery) +library(limma) +library(umap) +gset <- getGEO("GSE20347", GSEMatrix =TRUE, AnnotGPL=TRUE) +if (length(gset) > 1) idx <- grep("GPL571", attr(gset, "names")) else idx <- 1 +gset <- gset[[idx]] +# make proper column names to match toptable +fvarLabels(gset) <- make.names(fvarLabels(gset)) +# group membership for all samples +gsms <- "0000000000000000011111111111111111" +sml <- strsplit(gsms, split="")[[1]] +# log2 transformation +ex <- exprs(gset) +qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T)) +LogC <- (qx[5] > 100) || +(qx[6]-qx[1] > 50 && qx[2] > 0) +if (LogC) { ex[which(ex <= 0)] <- NaN +exprs(gset) <- log2(ex) } +# assign samples to groups and set up design matrix +gs <- factor(sml) +groups <- make.names(c("normal","cancer")) +levels(gs) <- groups +gset$group <- gs +design <- model.matrix(~group + 0, gset) +colnames(design) <- levels(gs) +gset <- gset[complete.cases(exprs(gset)), ] # skip missing values +fit <- lmFit(gset, design) # fit linear model +# set up contrasts of interest and recalculate model coefficients +cts <- c(paste(groups[1],"-",groups[2],sep="")) +cont.matrix <- makeContrasts(contrasts=cts, levels=design) +fit2 <- contrasts.fit(fit, cont.matrix) +# compute statistics and table of top significant genes +fit2 <- eBayes(fit2, 0.01) +tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250) +tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","GB_ACC","SPOT_ID","Gene.Symbol","Gene.symbol","Gene.title")) +write.table(tT, file=stdout(), row.names=F, sep="\t") +# Visualize and quality control test results. +# Build histogram of P-values for all genes. Normal test +# assumption is that most genes are not differentially expressed. +tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) +hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj", +ylab = "Number of genes", main = "P-adj value distribution") +# summarize test results as "up", "down" or "not expressed" +dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1) +# Venn diagram of results +vennDiagram(dT, circle.col=palette()) +# create Q-Q plot for t-statistic +t.good <- which(!is.na(fit2$F)) # filter out bad probes +qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic") +# volcano plot (log P-value vs log fold change) +colnames(fit2) # list contrast names +ct <- 1 # choose contrast of interest +volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20, +highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2))) +View(tT2) +View(tT) +volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20, +highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2))) +# MD plot (log fold change vs mean log expression) +# highlight statistically significant (p-adj < 0.05) probes +plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1) +abline(h=0) +################################################################ +# General expression data analysis +ex <- exprs(gset) +# box-and-whisker plot +dev.new(width=3+ncol(gset)/6, height=5) +ord <- order(gs) # order samples by group +palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02", +"#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666")) +library(GEOquery) +library(limma) +library(umap) +gset <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=FALSE) +if (length(gset) > 1) idx <- grep("GPL16543", attr(gset, "names")) else idx <- 1 +gset <- gset[[idx]] +# make proper column names to match toptable +fvarLabels(gset) <- make.names(fvarLabels(gset)) +# group membership for all samples +gsms <- paste0("11110000111100001111000011110000111100001111000011", +"11000011110000111100001111000011110000111100001111", +"00001111000011110000111100000111100011110000111100", +"00111100001011110000111100001111000011110000111100", +"00111000111100001111000011100011110000") +sml <- strsplit(gsms, split="")[[1]] +# log2 transformation +ex <- exprs(gset) +qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T)) +LogC <- (qx[5] > 100) || +(qx[6]-qx[1] > 50 && qx[2] > 0) +if (LogC) { ex[which(ex <= 0)] <- NaN +exprs(gset) <- log2(ex) } +# assign samples to groups and set up design matrix +gs <- factor(sml) +groups <- make.names(c("Normal","Cancer")) +levels(gs) <- groups +gset$group <- gs +design <- model.matrix(~group + 0, gset) +colnames(design) <- levels(gs) +gset <- gset[complete.cases(exprs(gset)), ] # skip missing values +fit <- lmFit(gset, design) # fit linear model +# set up contrasts of interest and recalculate model coefficients +cts <- paste(groups[1], groups[2], sep="-") +cont.matrix <- makeContrasts(contrasts=cts, levels=design) +fit2 <- contrasts.fit(fit, cont.matrix) +# compute statistics and table of top significant genes +fit2 <- eBayes(fit2, 0.01) +tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250) +tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","SPOT_ID","miRNA_ID")) +write.table(tT, file=stdout(), row.names=F, sep="\t") +# Visualize and quality control test results. +# Build histogram of P-values for all genes. Normal test +# assumption is that most genes are not differentially expressed. +tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) +hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj", +ylab = "Number of genes", main = "P-adj value distribution") +# summarize test results as "up", "down" or "not expressed" +dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1) +# Venn diagram of results +vennDiagram(dT, circle.col=palette()) +# create Q-Q plot for t-statistic +t.good <- which(!is.na(fit2$F)) # filter out bad probes +qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic") +# volcano plot (log P-value vs log fold change) +colnames(fit2) # list contrast names +ct <- 1 # choose contrast of interest +volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20, +highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2))) +# MD plot (log fold change vs mean log expression) +# highlight statistically significant (p-adj < 0.05) probes +plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1) +abline(h=0) +################################################################ +# General expression data analysis +ex <- exprs(gset) +# box-and-whisker plot +dev.new(width=3+ncol(gset)/6, height=5) +ord <- order(gs) # order samples by group +palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02", +"#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666")) +par(mar=c(7,4,2,1)) +title <- paste ("GSE43732", "/", annotation(gset), sep ="") +boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord]) +legend("topleft", groups, fill=palette(), bty="n") +dev.off() +# expression value distribution +par(mar=c(4,4,2,1)) +title <- paste ("GSE43732", "/", annotation(gset), " value distribution", sep ="") +plotDensities(ex, group=gs, main=title, legend ="topright") +# UMAP plot (dimensionality reduction) +ex <- na.omit(ex) # eliminate rows with NAs +ex <- ex[!duplicated(ex), ] # remove duplicates +ump <- umap(t(ex), n_neighbors = 15, random_state = 123) +par(mar=c(3,3,2,6), xpd=TRUE) +plot(ump$layout, main="UMAP plot, nbrs=15", xlab="", ylab="", col=gs, pch=20, cex=1.5) +legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20, +col=1:nlevels(gs), title="Group", pt.cex=1.5) +library("maptools") # point labels without overlaps +pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6) +# mean-variance trend, helps to see if precision weights are needed +plotSA(fit2, main="Mean variance trend, GSE43732") +# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0 +################################################################ +# Differential expression analysis with limma +library(GEOquery) +library(limma) +library(umap) +gset <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=FALSE) +if (length(gset) > 1) idx <- grep("GPL16543", attr(gset, "names")) else idx <- 1 +gset <- gset[[idx]] +# make proper column names to match toptable +fvarLabels(gset) <- make.names(fvarLabels(gset)) +# group membership for all samples +gsms <- paste0("00001111000011110000111100001111000011110000111100", +"00111100001111000011110000111100001111000011110000", +"11110000111100001111000011111000011100001111000011", +"11000011110100001111000011110000111100001111000011", +"11000111000011110000111100011100001111") +sml <- strsplit(gsms, split="")[[1]] +# log2 transformation +ex <- exprs(gset) +qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T)) +LogC <- (qx[5] > 100) || +(qx[6]-qx[1] > 50 && qx[2] > 0) +if (LogC) { ex[which(ex <= 0)] <- NaN +exprs(gset) <- log2(ex) } +# assign samples to groups and set up design matrix +gs <- factor(sml) +groups <- make.names(c("cancer","normal")) +levels(gs) <- groups +gset$group <- gs +design <- model.matrix(~group + 0, gset) +colnames(design) <- levels(gs) +gset <- gset[complete.cases(exprs(gset)), ] # skip missing values +fit <- lmFit(gset, design) # fit linear model +# set up contrasts of interest and recalculate model coefficients +cts <- paste(groups[1], groups[2], sep="-") +cont.matrix <- makeContrasts(contrasts=cts, levels=design) +fit2 <- contrasts.fit(fit, cont.matrix) +# compute statistics and table of top significant genes +fit2 <- eBayes(fit2, 0.01) +tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250) +tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","SPOT_ID","miRNA_ID")) +write.table(tT, file=stdout(), row.names=F, sep="\t") +# Visualize and quality control test results. +# Build histogram of P-values for all genes. Normal test +# assumption is that most genes are not differentially expressed. +tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) +hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj", +ylab = "Number of genes", main = "P-adj value distribution") +# summarize test results as "up", "down" or "not expressed" +dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=0) +# Venn diagram of results +vennDiagram(dT, circle.col=palette()) +# create Q-Q plot for t-statistic +t.good <- which(!is.na(fit2$F)) # filter out bad probes +qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic") +# volcano plot (log P-value vs log fold change) +colnames(fit2) # list contrast names +ct <- 1 # choose contrast of interest +volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20, +highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2))) +# MD plot (log fold change vs mean log expression) +# highlight statistically significant (p-adj < 0.05) probes +plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1) +abline(h=0) +################################################################ +# General expression data analysis +ex <- exprs(gset) +# box-and-whisker plot +dev.new(width=3+ncol(gset)/6, height=5) +ord <- order(gs) # order samples by group +palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02", +"#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666")) +par(mar=c(7,4,2,1)) +title <- paste ("GSE43732", "/", annotation(gset), sep ="") +boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord]) +legend("topleft", groups, fill=palette(), bty="n") +dev.off() +# expression value distribution +par(mar=c(4,4,2,1)) +title <- paste ("GSE43732", "/", annotation(gset), " value distribution", sep ="") +plotDensities(ex, group=gs, main=title, legend ="topright") +# UMAP plot (dimensionality reduction) +ex <- na.omit(ex) # eliminate rows with NAs +ex <- ex[!duplicated(ex), ] # remove duplicates +ump <- umap(t(ex), n_neighbors = 15, random_state = 123) +par(mar=c(3,3,2,6), xpd=TRUE) +plot(ump$layout, main="UMAP plot, nbrs=15", xlab="", ylab="", col=gs, pch=20, cex=1.5) +legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20, +col=1:nlevels(gs), title="Group", pt.cex=1.5) +library("maptools") # point labels without overlaps +pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6) +# mean-variance trend, helps to see if precision weights are needed +plotSA(fit2, main="Mean variance trend, GSE43732") diff --git "a/DGE/Deseq2/mRNA/GSE20347\342\200\213 mRNA\342\200\213.R" "b/DGE/Deseq2/mRNA/GSE20347\342\200\213 mRNA\342\200\213.R" new file mode 100644 index 0000000000000000000000000000000000000000..d6713d804dca6b8f63734f225deb28cc967fad48 --- /dev/null +++ "b/DGE/Deseq2/mRNA/GSE20347\342\200\213 mRNA\342\200\213.R" @@ -0,0 +1,94 @@ +library(GEOquery) +library(tidyverse) +library(magrittr) + +mRNA_GSE20347 <- getGEO("GSE20347", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T) +if (length(mRNA_GSE20347) > 1) idx <- grep("GPL4508", attr(mRNA_GSE20347, "names")) else idx <- 1 +mRNA_GSE20347 <- mRNA_GSE20347[[idx]] +GSE20347<- mRNA_GSE20347@assayData[["exprs"]] +colx<- mRNA_GSE20347@featureData@data[["Gene symbol"]] +GSE20347<- cbind(colx,GSE20347) + +# Convert matrix to a dataframe +df_GSE20347 <- as.data.frame(GSE20347) + +library(DESeq2) + +# Extract only the count matrix +count_matrix <- df_GSE20347[, 2:ncol(df_GSE20347)] + +# Sample information +sample_names <- colnames(count_matrix) + +# Create the sample_info data frame +conditions <- c(rep("normal adjacent esophageal tissue", 17), rep("esophageal squamous cell carcinoma", length(sample_names) - 17)) +sample_info <- data.frame(Sample = sample_names, Condition = conditions) + +#### SKIRMISH FOR DESEQ2 DATASET ########### +count_matrix <- as.matrix(sapply(count_matrix, as.numeric)) +count_matrix <- count_matrix[complete.cases(count_matrix), ] +count_matrix <- round(count_matrix) + +#get rownames of previous data +row_names_GSE20347 <- rownames(GSE20347) + +# Set gene_id to rownames again +rownames(count_matrix) <- row_names_GSE20347 + +# Create the DESeqDataSet using the updated sample_info +dds <- DESeqDataSetFromMatrix(countData = count_matrix, + colData = sample_info, + design = ~ Condition) + +# Perform differential gene expression analysis +dds <- DESeq(dds) + +resultsNames(dds) + +par(mar=c(8,5,2,2)) +boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2) + +res <- results(dds, name="Condition_normal.adjacent.esophageal.tissue_vs_esophageal.squamous.cell.carcinoma") +summary(res) + +# Adjustment to overcome NA-error --> exclude rows containing NA's +filtered_indices <- which(res$log2FoldChange < 1 & res$log2FoldChange > -1 & res$padj < 0.05 & !is.na(res$padj)) +filtered_downregulated_genes <- res[filtered_indices, ] +gene_names <- rownames(filtered_downregulated_genes) +gene_names + + +res2 <- as.data.frame(results(dds, name = "Condition_normal.adjacent.esophageal.tissue_vs_esophageal.squamous.cell.carcinoma")) + +# Convert p-value to -log10(p-value) +res2$log10_pvalue <- -log10(res2$pvalue) + +# Create the volcano plot +volcano_plot <- ggplot(data = res2, aes(x = log2FoldChange, y = log10_pvalue)) + + geom_point() + + labs(x = "log2 Fold Change", y = "-log10 p-value") + + theme_minimal() + +# Display the volcano plot +print(volcano_plot) + +# Add vertical lines for fold change thresholds +volcano_plot <- volcano_plot + + geom_vline(xintercept = c(-1, 1), color = "red") + + geom_hline(yintercept = -log10(0.05), color = "red") + +# Display the customized volcano plot +print(volcano_plot) + + + +rld <- rlog(dds, blind=TRUE) +PCA1 <- plotPCA(rld, intgroup = "Condition") +PCA1 + +vst <- assay(vst(dds)) +p <- pca(vst, removeVar = 0.1) +screeplot(p, axisLabSize = 18, titleLabSize = 22) + +biplot(p, showLoadings = TRUE, + labSize = 5, pointSize = 5, sizeLoadingsNames = 5) diff --git "a/DGE/Deseq2/mRNA/GSE29001\342\200\213 mRNA\342\200\213.R" "b/DGE/Deseq2/mRNA/GSE29001\342\200\213 mRNA\342\200\213.R" new file mode 100644 index 0000000000000000000000000000000000000000..c8be78f1ee4568c17d6cd3615dc6b254f18013b7 --- /dev/null +++ "b/DGE/Deseq2/mRNA/GSE29001\342\200\213 mRNA\342\200\213.R" @@ -0,0 +1,71 @@ +library(GEOquery) +library(tidyverse) +library(magrittr) + +mRNA_GSE29001 <- getGEO("GSE29001", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T) +if (length(mRNA_GSE29001) > 1) idx <- grep("GPL4508", attr(mRNA_GSE29001, "names")) else idx <- 1 +mRNA_GSE29001 <- mRNA_GSE29001[[idx]] +GSE29001 <- mRNA_GSE29001@assayData[["exprs"]] +colx <- mRNA_GSE29001@featureData@data[["Gene symbol"]] +GSE29001 <- cbind(colx,GSE29001) + +# Convert matrix to a dataframe +df_GSE29001 <- as.data.frame(GSE29001) + +library(DESeq2) + +# Extract only the count matrix +count_matrix <- df_GSE29001[, 2:ncol(df_GSE29001)] + +# Sample information +sample_names <- colnames(count_matrix) +conditions = c("normal", "normal", "tumor", "normal", "normal", "tumor", "tumor", "normal", "normal", "tumor", "tumor", + "normal", "normal", "tumor", "tumor", "normal", "normal", "tumor", "tumor", "normal", "normal", "tumor", + "normal", "normal", "tumor", "tumor", "normal", "normal", "tumor", "tumor", "normal", "normal", "tumor", + "tumor", "normal", "normal", "tumor", "tumor", "normal", "normal", "tumor", "tumor", "normal", "normal", "tumor") + +sample_info <- data.frame(Sample = sample_names, condition = conditions) + +#### SKIRMISH FOR DESEQ2 DATASET ########### +count_matrix <- as.matrix(sapply(count_matrix, as.numeric)) +count_matrix <- count_matrix[complete.cases(count_matrix), ] +count_matrix <- round(count_matrix) + +#get rownames of previous data +row_names_GSE29001 <- rownames(GSE29001) + +# Set gene_id to rownames again +rownames(count_matrix) <- row_names_GSE29001 + +# Create the DESeqDataSet using the updated sample_info +dds <- DESeqDataSetFromMatrix(countData = count_matrix, + colData = sample_info, + design = ~ condition) + +# Perform differential gene expression analysis +dds <- DESeq(dds) + +resultsNames(dds) + +par(mar=c(8,5,2,2)) +boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2) + +res <- results(dds, name="condition_tumor_vs_normal") +summary(res) + +# Adjustment to overcome NA-error --> exclude rows containing NA's +filtered_indices <- which(res$log2FoldChange > 1 & res$padj < 0.05 & !is.na(res$padj)) +filtered_downregulated_genes <- res[filtered_indices, ] +gene_names <- rownames(filtered_downregulated_genes) +gene_names + +rld <- rlog(dds, blind=TRUE) +PCA1 <- plotPCA(rld, intgroup = "condition") +PCA1 + +vst <- assay(vst(dds)) +p <- pca(vst, removeVar = 0.1) +screeplot(p, axisLabSize = 18, titleLabSize = 22) + +biplot(p, showLoadings = TRUE, + labSize = 5, pointSize = 5, sizeLoadingsNames = 5) diff --git "a/DGE/Deseq2/mRNA/GSE70409\342\200\213 mRNA\342\200\213.R" "b/DGE/Deseq2/mRNA/GSE70409\342\200\213 mRNA\342\200\213.R" new file mode 100644 index 0000000000000000000000000000000000000000..06db7568eccf467e15bf67cd422a0e717cc35ccd --- /dev/null +++ "b/DGE/Deseq2/mRNA/GSE70409\342\200\213 mRNA\342\200\213.R" @@ -0,0 +1,71 @@ +library(GEOquery) +library(tidyverse) +library(magrittr) +library(PCAtools) + +mRNA_GSE70409 <- getGEO("GSE70409", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T) +if (length(mRNA_GSE70409) > 1) idx <- grep("GPL4508", attr(mRNA_GSE70409, "names")) else idx <- 1 +mRNA_GSE70409 <- mRNA_GSE70409[[idx]] +GSE70409 <- mRNA_GSE70409@assayData[["exprs"]] +colx<- mRNA_GSE70409@featureData@data[["Gene_symbol"]] +GSE70409 <- cbind(colx,GSE70409) + +# Convert matrix to a dataframe +df_GSE70409 <- as.data.frame(GSE70409) + +library(DESeq2) + +# Extract only the count matrix +count_matrix <- df_GSE70409[, 2:ncol(df_GSE70409)] + +# Sample information +sample_names <- colnames(count_matrix) +conditions = c("normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", + "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", + "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor") + +sample_info <- data.frame(Sample = sample_names, condition = conditions) + +#### SKIRMISH FOR DESEQ2 DATASET ########### +count_matrix <- as.matrix(sapply(count_matrix, as.numeric)) +count_matrix <- count_matrix[complete.cases(count_matrix), ] +count_matrix <- round(count_matrix) + +#get rownames of previous data +row_names_GSE70409 <- rownames(GSE70409) + +# Set gene_id to rownames again +rownames(count_matrix) <- row_names_GSE70409 + +# Create the DESeqDataSet using the updated sample_info +dds <- DESeqDataSetFromMatrix(countData = count_matrix, + colData = sample_info, + design = ~ condition) + +# Perform differential gene expression analysis +dds <- DESeq(dds) +resultsNames(dds) + +par(mar=c(8,5,2,2)) +boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2) + +res <- results(dds, name="condition_tumor_vs_normal") +summary(res) + +# Adjustment to overcome NA-error --> exclude rows containing NA's +filtered_indices <- which(res$log2FoldChange > 1 & res$padj < 0.05 & !is.na(res$padj)) +filtered_downregulated_genes <- res[filtered_indices, ] +gene_names <- rownames(filtered_downregulated_genes) +gene_names + + +rld <- rlog(dds, blind=TRUE) +PCA1 <- plotPCA(rld, intgroup = "condition") +PCA1 + +vst <- assay(vst(dds)) +p <- pca(vst, removeVar = 0.1) +screeplot(p, axisLabSize = 18, titleLabSize = 22) + +biplot(p, showLoadings = TRUE, + labSize = 5, pointSize = 5, sizeLoadingsNames = 5) diff --git a/DGE/Deseq2/miRNA/GSE43732_miRNA.R b/DGE/Deseq2/miRNA/GSE43732_miRNA.R new file mode 100644 index 0000000000000000000000000000000000000000..4c9831a5b6a506bf1b5fd985bcb904d6e5fb87a0 --- /dev/null +++ b/DGE/Deseq2/miRNA/GSE43732_miRNA.R @@ -0,0 +1,115 @@ +library(GEOquery) +library(tidyverse) +library(magrittr) +library(multiMiR) +library(writexl) + +miRNA_GSE43732 <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T) +if (length(miRNA_GSE43732) > 1) idx <- grep("GPL4508", attr(miRNA_GSE43732, "names")) else idx <- 1 +miRNA_GSE43732 <- miRNA_GSE43732[[idx]] +GSE43732 <- miRNA_GSE43732@assayData[["exprs"]] +colx<- miRNA_GSE43732@featureData@data[["Name"]] +GSE43732 <- cbind(colx,GSE43732) + + +# Convert matrix to a dataframe +df_GSE43732 <- as.data.frame(GSE43732) + +library(DESeq2) + +# Extract only the count matrix +count_matrix <- df_GSE43732[, 1:ncol(df_GSE43732)] + +# Sample information +sample_names <- colnames(count_matrix) +conditions = c("cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal") + +sample_info <- data.frame(Sample = sample_names, condition = conditions) + +#### SKIRMISH FOR DESEQ2 DATASET ########### +#make NA's to 0 +count_matrix <- as.matrix(sapply(count_matrix, as.numeric)) +count_matrix[is.na(count_matrix)] <- 0 +count_matrix <- round(count_matrix) + +#get rownames of previous data +row_names_GSE43732 <- rownames(GSE43732) + +# Set gene_id to rownames again +rownames(count_matrix) <- row_names_GSE43732 + +#negative values to abs() --> simulate counts +count_matrix <- abs(count_matrix) + + +# Create the DESeqDataSet using the updated sample_info +dds <- DESeqDataSetFromMatrix(countData = count_matrix, + colData = sample_info, + design = ~ condition) + +# Perform differential gene expression analysis +dds <- DESeq(dds) + +resultsNames(dds) + +par(mar=c(8,5,2,2)) +boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2) + +res <- results(dds, name="condition_normal_vs_cancer") +summary(res) + +# Adjustment to overcome NA-error --> exclude rows containing NA's +filtered_indices <- which(res$log2FoldChange > 1 & res$padj < 0.05 & !is.na(res$padj)) +filtered_downregulated_genes <- res[filtered_indices, ] +gene_names <- rownames(filtered_downregulated_genes) +gene_names + +multimir_results <- get_multimir(org = 'hsa', + mirna = gene_names, + table = 'validated', + summary = TRUE) + +targets <- multimir_results@data +excel_file <- "C:/Users/Emre/Desktop/Bioinformatik/Bioinformatik 2.Semester Master/Data Science in the Life Sciences/Project/multimir_results.xlsx" +write_xlsx(targets, path = excel_file ) + +rld <- rlog(dds, blind=TRUE) +PCA1 <- plotPCA(rld, intgroup = "condition") +PCA1 + +vst <- assay(vst(dds)) +p <- pca(vst, removeVar = 0.1) +screeplot(p, axisLabSize = 18, titleLabSize = 22) + +biplot(p, showLoadings = TRUE, + labSize = 5, pointSize = 5, sizeLoadingsNames = 5) diff --git a/DGE/Deseq2/miRNA/GSE43732_miRNA_Multimir.R b/DGE/Deseq2/miRNA/GSE43732_miRNA_Multimir.R new file mode 100644 index 0000000000000000000000000000000000000000..ae5627db9110831b6f8004baa203eb86fd2ca1c8 --- /dev/null +++ b/DGE/Deseq2/miRNA/GSE43732_miRNA_Multimir.R @@ -0,0 +1,137 @@ +# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0 +################################################################ +# Differential expression analysis with limma +library(GEOquery) +library(limma) +library(umap) + +# load series and platform data from GEO + +gset <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=FALSE) +if (length(gset) > 1) idx <- grep("GPL16543", attr(gset, "names")) else idx <- 1 +gset <- gset[[idx]] + +# make proper column names to match toptable +fvarLabels(gset) <- make.names(fvarLabels(gset)) + +# group membership for all samples +gsms <- paste0("11110000111100001111000011110000111100001111000011", + "11000011110000111100001111000011110000111100001111", + "00001111000011110000111100000111100011110000111100", + "00111100001011110000111100001111000011110000111100", + "00111000111100001111000011100011110000") +sml <- strsplit(gsms, split="")[[1]] + +# log2 transformation +ex <- exprs(gset) +qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T)) +LogC <- (qx[5] > 100) || + (qx[6]-qx[1] > 50 && qx[2] > 0) +if (LogC) { ex[which(ex <= 0)] <- NaN + exprs(gset) <- log2(ex) } + +# assign samples to groups and set up design matrix +gs <- factor(sml) +groups <- make.names(c("Normal","Cancer")) +levels(gs) <- groups +gset$group <- gs +design <- model.matrix(~group + 0, gset) +colnames(design) <- levels(gs) + +gset <- gset[complete.cases(exprs(gset)), ] # skip missing values + +fit <- lmFit(gset, design) # fit linear model + +# set up contrasts of interest and recalculate model coefficients +cts <- paste(groups[1], groups[2], sep="-") +cont.matrix <- makeContrasts(contrasts=cts, levels=design) +fit2 <- contrasts.fit(fit, cont.matrix) + +# compute statistics and table of top significant genes +fit2 <- eBayes(fit2, 0.01) +tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250) + +tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","SPOT_ID","miRNA_ID")) + +################ GO ######################### + + + +######################### MULTIMIR ################### +mirnames <- tT$ID + +library(multiMiR) +multimir_results <- get_multimir(org = 'hsa', + mirna = mirnames, + table = 'validated', + summary = TRUE) +head(multimir_results@data) + +targets <- multimir_results@data +excel_file <- "C:/Users/Emre/Desktop/Bioinformatik/Bioinformatik 2.Semester Master/Data Science in the Life Sciences/Project/multimir_results.xlsx" +write_xlsx(targets, path = excel_file ) + +write.table(tT, file=stdout(), row.names=F, sep="\t") + +# Visualize and quality control test results. +# Build histogram of P-values for all genes. Normal test +# assumption is that most genes are not differentially expressed. +tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) +hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj", + ylab = "Number of genes", main = "P-adj value distribution") + +# summarize test results as "up", "down" or "not expressed" +dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1) + +# Venn diagram of results +vennDiagram(dT, circle.col=palette()) + +# create Q-Q plot for t-statistic +t.good <- which(!is.na(fit2$F)) # filter out bad probes +qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic") + +# volcano plot (log P-value vs log fold change) +colnames(fit2) # list contrast names +ct <- 1 # choose contrast of interest +volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20, + highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2))) + +# MD plot (log fold change vs mean log expression) +# highlight statistically significant (p-adj < 0.05) probes +plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1) +abline(h=0) + +################################################################ +# General expression data analysis +ex <- exprs(gset) + +# box-and-whisker plot +dev.new(width=3+ncol(gset)/6, height=5) +ord <- order(gs) # order samples by group +palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02", + "#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666")) +par(mar=c(7,4,2,1)) +title <- paste ("GSE43732", "/", annotation(gset), sep ="") +boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord]) +legend("topleft", groups, fill=palette(), bty="n") +dev.off() + +# expression value distribution +par(mar=c(4,4,2,1)) +title <- paste ("GSE43732", "/", annotation(gset), " value distribution", sep ="") +plotDensities(ex, group=gs, main=title, legend ="topright") + +# UMAP plot (dimensionality reduction) +ex <- na.omit(ex) # eliminate rows with NAs +ex <- ex[!duplicated(ex), ] # remove duplicates +ump <- umap(t(ex), n_neighbors = 15, random_state = 123) +par(mar=c(3,3,2,6), xpd=TRUE) +plot(ump$layout, main="UMAP plot, nbrs=15", xlab="", ylab="", col=gs, pch=20, cex=1.5) +legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20, +col=1:nlevels(gs), title="Group", pt.cex=1.5) +library("maptools") # point labels without overlaps +pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6) + +# mean-variance trend, helps to see if precision weights are needed +plotSA(fit2, main="Mean variance trend, GSE43732") + diff --git a/DGE/Deseq2/miRNA/GSE55857 miRNA errors.R b/DGE/Deseq2/miRNA/GSE55857 miRNA errors.R new file mode 100644 index 0000000000000000000000000000000000000000..08e8e85c7f09a00eb473cfa87c796e41889a2b79 --- /dev/null +++ b/DGE/Deseq2/miRNA/GSE55857 miRNA errors.R @@ -0,0 +1,78 @@ +library(GEOquery) +library(tidyverse) +library(magrittr) +library(DESeq2) + +miRNA_GSE55856 <- getGEO("GSE55856", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T) +GSE55856 <- miRNA_GSE55856[["GSE55856_series_matrix.txt.gz"]]@assayData[["exprs"]] +GSE55856 <- cbind(colx,GSE55856) + + +# Convert matrix to a dataframe +df_GSE55856 <- as.data.frame(GSE55856) + +# Extract only the count matrix +count_matrix <- df_GSE55856[, 1:ncol(df_GSE55856)] + +# Sample information +sample_names <- colnames(count_matrix) +conditions = c("normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", + "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", + "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", + "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", + "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", + "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", + "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", + "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", + "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", + "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", + "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", + "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", + "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", + "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", + "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", + "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", + "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", + "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", + "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal", + "normal", "tumor", "normal", "tumor", "normal", "tumor", "normal") + +sample_info <- data.frame(Sample = sample_names, condition = conditions) + +#### SKIRMISH FOR DESEQ2 DATASET ########### +#make NA's to 0 +count_matrix <- as.matrix(sapply(count_matrix, as.numeric)) +count_matrix[is.na(count_matrix)] <- 0 +count_matrix <- round(count_matrix) + +#get rownames of previous data +row_names_GSE55856 <- rownames(GSE55856) + +# Set gene_id to rownames again +rownames(count_matrix) <- row_names_GSE55856 + +#negative values to abs() --> simulate counts +#count_matrix <- abs(count_matrix) + + +# Create the DESeqDataSet using the updated sample_info +dds <- DESeqDataSetFromMatrix(countData = count_matrix, + colData = sample_info, + design = ~ condition) + +# Perform differential gene expression analysis +dds <- DESeq(dds) + +resultsNames(dds) +res <- results(dds, name="condition_tumor_vs_normal") +summary(res) + +# Adjustment to overcome NA-error --> exclude rows containing NA's +filtered_indices <- which(res$log2FoldChange > 1 & res$padj < 0.05 & !is.na(res$padj)) +filtered_downregulated_genes <- res[filtered_indices, ] +gene_names <- rownames(filtered_downregulated_genes) +gene_names + +rld <- rlog(dds, blind=TRUE) +PCA1 <- plotPCA(rld, intgroup = "condition") +PCA1 diff --git a/DGE/Deseq2/miRNA/GSE6188 miRNA.R b/DGE/Deseq2/miRNA/GSE6188 miRNA.R new file mode 100644 index 0000000000000000000000000000000000000000..634b4f55d1f205a568e82fd6460efe07f84be81e --- /dev/null +++ b/DGE/Deseq2/miRNA/GSE6188 miRNA.R @@ -0,0 +1,110 @@ +library(GEOquery) +library(tidyverse) +library(dplyr) +library(PCAtools) +library(DESeq2) + +miRNA_GSE6188 <- getGEO("GSE6188", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T) +if (length(miRNA_GSE6188) > 1) idx <- grep("GPL4508", attr(miRNA_GSE6188, "names")) else idx <- 1 +miRNA_GSE6188 <- miRNA_GSE6188[[idx]] +GSE6188 <- miRNA_GSE6188@assayData[["exprs"]] +colx<- miRNA_GSE6188@featureData@data[["Name"]] +GSE6188 <- cbind(colx,GSE6188) + +# Convert matrix to a dataframe +df_GSE6188 <- as.data.frame(GSE6188) +df_GSE6188[is.na(df_GSE6188)] <- 0 + + +# Group the data frame by 'colx' and calculate the mean for other columns +df_mean <- df_GSE6188 %>% + group_by(colx) %>% + summarise(everything(), mean, na.rm = TRUE) + + +# Extract only the count matrix +count_matrix <- df_GSE43732[, 1:ncol(df_GSE43732)] + +# Sample information +sample_names <- colnames(count_matrix) +conditions = c("cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "normal", "normal", "normal", + "cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal") + +sample_info <- data.frame(Sample = sample_names, condition = conditions) + +#### SKIRMISH FOR DESEQ2 DATASET ########### +#make NA's to 0 +count_matrix <- as.matrix(sapply(count_matrix, as.numeric)) +count_matrix[is.na(count_matrix)] <- 0 +count_matrix <- round(count_matrix) + +#get rownames of previous data +row_names_GSE43732 <- rownames(GSE43732) + +# Set gene_id to rownames again +rownames(count_matrix) <- row_names_GSE43732 + +#negative values to abs() --> simulate counts +count_matrix <- abs(count_matrix) + + +# Create the DESeqDataSet using the updated sample_info +dds <- DESeqDataSetFromMatrix(countData = count_matrix, + colData = sample_info, + design = ~ condition) + +# Perform differential gene expression analysis +dds <- DESeq(dds) +resultsNames(dds) + +par(mar=c(8,5,2,2)) +boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2) + +res <- results(dds, name="condition_normal_vs_cancer") +summary(res) + +# Adjustment to overcome NA-error --> exclude rows containing NA's +filtered_indices <- which(res$log2FoldChange > 1 & res$padj < 0.05 & !is.na(res$padj)) +filtered_downregulated_genes <- res[filtered_indices, ] +gene_names <- rownames(filtered_downregulated_genes) +gene_names + +rld <- rlog(dds, blind=TRUE) +PCA1 <- plotPCA(rld, intgroup = "condition") +PCA1 + +vst <- assay(vst(dds)) +p <- pca(vst, removeVar = 0.1) +screeplot(p, axisLabSize = 18, titleLabSize = 22) + +biplot(p, showLoadings = TRUE, + labSize = 5, pointSize = 5, sizeLoadingsNames = 5) \ No newline at end of file diff --git a/DGE/Limma/mRNA/GSE20347.R b/DGE/Limma/mRNA/GSE20347.R new file mode 100644 index 0000000000000000000000000000000000000000..209ebaadf86515b5a9212fa8c4193e97ea280d58 --- /dev/null +++ b/DGE/Limma/mRNA/GSE20347.R @@ -0,0 +1,113 @@ +# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0 +################################################################ +# Differential expression analysis with limma +library(GEOquery) +library(limma) +library(umap) + +# load series and platform data from GEO + +gset <- getGEO("GSE20347", GSEMatrix =TRUE, AnnotGPL=TRUE) +if (length(gset) > 1) idx <- grep("GPL571", attr(gset, "names")) else idx <- 1 +gset <- gset[[idx]] + +# make proper column names to match toptable +fvarLabels(gset) <- make.names(fvarLabels(gset)) + +# group membership for all samples +gsms <- "0000000000000000011111111111111111" +sml <- strsplit(gsms, split="")[[1]] + +# log2 transformation +ex <- exprs(gset) +qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T)) +LogC <- (qx[5] > 100) || + (qx[6]-qx[1] > 50 && qx[2] > 0) +if (LogC) { ex[which(ex <= 0)] <- NaN +exprs(gset) <- log2(ex) } + +# assign samples to groups and set up design matrix +gs <- factor(sml) +groups <- make.names(c("normal","cancer")) +levels(gs) <- groups +gset$group <- gs +design <- model.matrix(~group + 0, gset) +colnames(design) <- levels(gs) + +gset <- gset[complete.cases(exprs(gset)), ] # skip missing values + +fit <- lmFit(gset, design) # fit linear model + +# set up contrasts of interest and recalculate model coefficients +cts <- c(paste(groups[1],"-",groups[2],sep="")) +cont.matrix <- makeContrasts(contrasts=cts, levels=design) +fit2 <- contrasts.fit(fit, cont.matrix) + +# compute statistics and table of top significant genes +fit2 <- eBayes(fit2, 0.01) +tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250) + +tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","GB_ACC","SPOT_ID","Gene.Symbol","Gene.symbol","Gene.title")) +write.table(tT, file=stdout(), row.names=F, sep="\t") + +# Visualize and quality control test results. +# Build histogram of P-values for all genes. Normal test +# assumption is that most genes are not differentially expressed. +tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) +hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj", + ylab = "Number of genes", main = "P-adj value distribution") + +# summarize test results as "up", "down" or "not expressed" +dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1) + +# Venn diagram of results +vennDiagram(dT, circle.col=palette()) + +# create Q-Q plot for t-statistic +t.good <- which(!is.na(fit2$F)) # filter out bad probes +qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic") + +# volcano plot (log P-value vs log fold change) +colnames(fit2) # list contrast names +ct <- 1 # choose contrast of interest +volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20, + highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2))) + +# MD plot (log fold change vs mean log expression) +# highlight statistically significant (p-adj < 0.05) probes +plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1) +abline(h=0) + +################################################################ +# General expression data analysis +ex <- exprs(gset) + +# box-and-whisker plot +dev.new(width=3+ncol(gset)/6, height=5) +ord <- order(gs) # order samples by group +palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02", + "#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666")) +par(mar=c(7,4,2,1)) +title <- paste ("GSE20347", "/", annotation(gset), sep ="") +boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord]) +legend("topleft", groups, fill=palette(), bty="n") +dev.off() + +# expression value distribution +par(mar=c(4,4,2,1)) +title <- paste ("GSE20347", "/", annotation(gset), " value distribution", sep ="") +plotDensities(ex, group=gs, main=title, legend ="topright") + +# UMAP plot (dimensionality reduction) +ex <- na.omit(ex) # eliminate rows with NAs +ex <- ex[!duplicated(ex), ] # remove duplicates +ump <- umap(t(ex), n_neighbors = 14, random_state = 123) +par(mar=c(3,3,2,6), xpd=TRUE) +plot(ump$layout, main="UMAP plot, nbrs=14", xlab="", ylab="", col=gs, pch=20, cex=1.5) +legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20, + col=1:nlevels(gs), title="Group", pt.cex=1.5) +library("maptools") # point labels without overlaps +pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6) + +# mean-variance trend, helps to see if precision weights are needed +plotSA(fit2, main="Mean variance trend, GSE20347") diff --git a/DGE/Limma/mRNA/GSE29001.R b/DGE/Limma/mRNA/GSE29001.R new file mode 100644 index 0000000000000000000000000000000000000000..c08bfe481bc452bf6ff04a285a960bb1c59d7a3d --- /dev/null +++ b/DGE/Limma/mRNA/GSE29001.R @@ -0,0 +1,113 @@ +# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0 +################################################################ +# Differential expression analysis with limma +library(GEOquery) +library(limma) +library(umap) + +# load series and platform data from GEO + +gset <- getGEO("GSE29001", GSEMatrix =TRUE, AnnotGPL=TRUE) +if (length(gset) > 1) idx <- grep("GPL571", attr(gset, "names")) else idx <- 1 +gset <- gset[[idx]] + +# make proper column names to match toptable +fvarLabels(gset) <- make.names(fvarLabels(gset)) + +# group membership for all samples +gsms <- "001001100110011001100100110011001100110011001" +sml <- strsplit(gsms, split="")[[1]] + +# log2 transformation +ex <- exprs(gset) +qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T)) +LogC <- (qx[5] > 100) || + (qx[6]-qx[1] > 50 && qx[2] > 0) +if (LogC) { ex[which(ex <= 0)] <- NaN +exprs(gset) <- log2(ex) } + +# assign samples to groups and set up design matrix +gs <- factor(sml) +groups <- make.names(c("normal","cancer")) +levels(gs) <- groups +gset$group <- gs +design <- model.matrix(~group + 0, gset) +colnames(design) <- levels(gs) + +gset <- gset[complete.cases(exprs(gset)), ] # skip missing values + +fit <- lmFit(gset, design) # fit linear model + +# set up contrasts of interest and recalculate model coefficients +cts <- c(paste(groups[1],"-",groups[2],sep="")) +cont.matrix <- makeContrasts(contrasts=cts, levels=design) +fit2 <- contrasts.fit(fit, cont.matrix) + +# compute statistics and table of top significant genes +fit2 <- eBayes(fit2, 0.01) +tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250) + +tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","GB_ACC","SPOT_ID","Gene.Symbol","Gene.symbol","Gene.title")) +write.table(tT, file=stdout(), row.names=F, sep="\t") + +# Visualize and quality control test results. +# Build histogram of P-values for all genes. Normal test +# assumption is that most genes are not differentially expressed. +tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) +hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj", + ylab = "Number of genes", main = "P-adj value distribution") + +# summarize test results as "up", "down" or "not expressed" +dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1) + +# Venn diagram of results +vennDiagram(dT, circle.col=palette()) + +# create Q-Q plot for t-statistic +t.good <- which(!is.na(fit2$F)) # filter out bad probes +qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic") + +# volcano plot (log P-value vs log fold change) +colnames(fit2) # list contrast names +ct <- 1 # choose contrast of interest +volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20, + highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2))) + +# MD plot (log fold change vs mean log expression) +# highlight statistically significant (p-adj < 0.05) probes +plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1) +abline(h=0) + +################################################################ +# General expression data analysis +ex <- exprs(gset) + +# box-and-whisker plot +dev.new(width=3+ncol(gset)/6, height=5) +ord <- order(gs) # order samples by group +palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02", + "#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666")) +par(mar=c(7,4,2,1)) +title <- paste ("GSE29001", "/", annotation(gset), sep ="") +boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord]) +legend("topleft", groups, fill=palette(), bty="n") +dev.off() + +# expression value distribution +par(mar=c(4,4,2,1)) +title <- paste ("GSE29001", "/", annotation(gset), " value distribution", sep ="") +plotDensities(ex, group=gs, main=title, legend ="topright") + +# UMAP plot (dimensionality reduction) +ex <- na.omit(ex) # eliminate rows with NAs +ex <- ex[!duplicated(ex), ] # remove duplicates +ump <- umap(t(ex), n_neighbors = 15, random_state = 123) +par(mar=c(3,3,2,6), xpd=TRUE) +plot(ump$layout, main="UMAP plot, nbrs=15", xlab="", ylab="", col=gs, pch=20, cex=1.5) +legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20, + col=1:nlevels(gs), title="Group", pt.cex=1.5) +library("maptools") # point labels without overlaps +pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6) + +# mean-variance trend, helps to see if precision weights are needed +plotSA(fit2, main="Mean variance trend, GSE29001") diff --git a/DGE/Limma/mRNA/GSE70409.R b/DGE/Limma/mRNA/GSE70409.R new file mode 100644 index 0000000000000000000000000000000000000000..d0571bc2322d9005c0dd72ead9a98136e55a78ad --- /dev/null +++ b/DGE/Limma/mRNA/GSE70409.R @@ -0,0 +1,113 @@ +# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0 +################################################################ +# Differential expression analysis with limma +library(GEOquery) +library(limma) +library(umap) + +# load series and platform data from GEO + +gset <- getGEO("GSE70409", GSEMatrix =TRUE, AnnotGPL=FALSE) +if (length(gset) > 1) idx <- grep("GPL13287", attr(gset, "names")) else idx <- 1 +gset <- gset[[idx]] + +# make proper column names to match toptable +fvarLabels(gset) <- make.names(fvarLabels(gset)) + +# group membership for all samples +gsms <- "0101010101010101010101010101011111" +sml <- strsplit(gsms, split="")[[1]] + +# log2 transformation +ex <- exprs(gset) +qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T)) +LogC <- (qx[5] > 100) || + (qx[6]-qx[1] > 50 && qx[2] > 0) +if (LogC) { ex[which(ex <= 0)] <- NaN +exprs(gset) <- log2(ex) } + +# assign samples to groups and set up design matrix +gs <- factor(sml) +groups <- make.names(c("normal","cancer")) +levels(gs) <- groups +gset$group <- gs +design <- model.matrix(~group + 0, gset) +colnames(design) <- levels(gs) + +gset <- gset[complete.cases(exprs(gset)), ] # skip missing values + +fit <- lmFit(gset, design) # fit linear model + +# set up contrasts of interest and recalculate model coefficients +cts <- c(paste(groups[1],"-",groups[2],sep="")) +cont.matrix <- makeContrasts(contrasts=cts, levels=design) +fit2 <- contrasts.fit(fit, cont.matrix) + +# compute statistics and table of top significant genes +fit2 <- eBayes(fit2, 0.01) +tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250) + +tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","GB_ACC","SEQUENCE","SPOT_ID")) +write.table(tT, file=stdout(), row.names=F, sep="\t") + +# Visualize and quality control test results. +# Build histogram of P-values for all genes. Normal test +# assumption is that most genes are not differentially expressed. +tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) +hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj", + ylab = "Number of genes", main = "P-adj value distribution") + +# summarize test results as "up", "down" or "not expressed" +dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1) + +# Venn diagram of results +vennDiagram(dT, circle.col=palette()) + +# create Q-Q plot for t-statistic +t.good <- which(!is.na(fit2$F)) # filter out bad probes +qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic") + +# volcano plot (log P-value vs log fold change) +colnames(fit2) # list contrast names +ct <- 1 # choose contrast of interest +volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20, + highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2))) + +# MD plot (log fold change vs mean log expression) +# highlight statistically significant (p-adj < 0.05) probes +plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1) +abline(h=0) + +################################################################ +# General expression data analysis +ex <- exprs(gset) + +# box-and-whisker plot +dev.new(width=3+ncol(gset)/6, height=5) +ord <- order(gs) # order samples by group +palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02", + "#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666")) +par(mar=c(7,4,2,1)) +title <- paste ("GSE70409", "/", annotation(gset), sep ="") +boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord]) +legend("topleft", groups, fill=palette(), bty="n") +dev.off() + +# expression value distribution +par(mar=c(4,4,2,1)) +title <- paste ("GSE70409", "/", annotation(gset), " value distribution", sep ="") +plotDensities(ex, group=gs, main=title, legend ="topright") + +# UMAP plot (dimensionality reduction) +ex <- na.omit(ex) # eliminate rows with NAs +ex <- ex[!duplicated(ex), ] # remove duplicates +ump <- umap(t(ex), n_neighbors = 14, random_state = 123) +par(mar=c(3,3,2,6), xpd=TRUE) +plot(ump$layout, main="UMAP plot, nbrs=14", xlab="", ylab="", col=gs, pch=20, cex=1.5) +legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20, + col=1:nlevels(gs), title="Group", pt.cex=1.5) +library("maptools") # point labels without overlaps +pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6) + +# mean-variance trend, helps to see if precision weights are needed +plotSA(fit2, main="Mean variance trend, GSE70409") \ No newline at end of file diff --git a/DGE/Limma/miRNA/.Rhistory b/DGE/Limma/miRNA/.Rhistory new file mode 100644 index 0000000000000000000000000000000000000000..e03d32136de402b075be7dd28946957cb28d8937 --- /dev/null +++ b/DGE/Limma/miRNA/.Rhistory @@ -0,0 +1,512 @@ +fit2 <- eBayes(fit2, 0.01) +tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250) +tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","SPOT_ID","miRNA_ID")) +write.table(tT, file=stdout(), row.names=F, sep="\t") +# Visualize and quality control test results. +# Build histogram of P-values for all genes. Normal test +# assumption is that most genes are not differentially expressed. +tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) +hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj", +ylab = "Number of genes", main = "P-adj value distribution") +# summarize test results as "up", "down" or "not expressed" +dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1) +# Venn diagram of results +vennDiagram(dT, circle.col=palette()) +# create Q-Q plot for t-statistic +t.good <- which(!is.na(fit2$F)) # filter out bad probes +qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic") +# volcano plot (log P-value vs log fold change) +colnames(fit2) # list contrast names +ct <- 1 # choose contrast of interest +volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20, +highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2))) +# MD plot (log fold change vs mean log expression) +# highlight statistically significant (p-adj < 0.05) probes +plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1) +abline(h=0) +################################################################ +# General expression data analysis +ex <- exprs(gset) +# box-and-whisker plot +dev.new(width=3+ncol(gset)/6, height=5) +ord <- order(gs) # order samples by group +palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02", +"#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666")) +par(mar=c(7,4,2,1)) +title <- paste ("GSE43732", "/", annotation(gset), sep ="") +boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord]) +legend("topleft", groups, fill=palette(), bty="n") +dev.off() +# expression value distribution +par(mar=c(4,4,2,1)) +title <- paste ("GSE43732", "/", annotation(gset), " value distribution", sep ="") +plotDensities(ex, group=gs, main=title, legend ="topright") +# UMAP plot (dimensionality reduction) +ex <- na.omit(ex) # eliminate rows with NAs +ex <- ex[!duplicated(ex), ] # remove duplicates +ump <- umap(t(ex), n_neighbors = 15, random_state = 123) +par(mar=c(3,3,2,6), xpd=TRUE) +plot(ump$layout, main="UMAP plot, nbrs=15", xlab="", ylab="", col=gs, pch=20, cex=1.5) +legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20, +col=1:nlevels(gs), title="Group", pt.cex=1.5) +library("maptools") # point labels without overlaps +pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6) +# mean-variance trend, helps to see if precision weights are needed +plotSA(fit2, main="Mean variance trend, GSE43732") +# volcano plot (log P-value vs log fold change) +colnames(fit2) # list contrast names +ct <- 1 # choose contrast of interest +volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20, +highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2))) +ggplot(fit2, aes(x = logFC, y = -log10p)) + +geom_point(color = "grey", alpha = 0.8) + +geom_point(data = subset(volcano_df, logFC > logFC_threshold & -log10p > p_threshold), +color = "red", alpha = 0.8) + +xlim(c(-max(abs(fold_changes)), max(abs(fold_changes)))) + +ylim(c(0, max(-log10(p_values)))) + +labs(x = "Log2 Fold Change", y = "-log10 p-value", title = "Volcano Plot") + +theme_minimal() +# Example code +library(ggplot) +# Example code +library(ggplot2) +ggplot(fit2, aes(x = logFC, y = -log10p)) + +geom_point(color = "grey", alpha = 0.8) + +geom_point(data = subset(volcano_df, logFC > logFC_threshold & -log10p > p_threshold), +color = "red", alpha = 0.8) + +xlim(c(-max(abs(fold_changes)), max(abs(fold_changes)))) + +ylim(c(0, max(-log10(p_values)))) + +labs(x = "Log2 Fold Change", y = "-log10 p-value", title = "Volcano Plot") + +theme_minimal() +library(GEOquery) +library(tidyverse) +library(magrittr) +library(multiMiR) +library(writexl) +miRNA_GSE43732 <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=TRUE,getGPL= T) +if (length(miRNA_GSE43732) > 1) idx <- grep("GPL4508", attr(miRNA_GSE43732, "names")) else idx <- 1 +miRNA_GSE43732 <- miRNA_GSE43732[[idx]] +GSE43732 <- miRNA_GSE43732@assayData[["exprs"]] +colx<- miRNA_GSE43732@featureData@data[["Name"]] +GSE43732 <- cbind(colx,GSE43732) +# Convert matrix to a dataframe +df_GSE43732 <- as.data.frame(GSE43732) +library(DESeq2) +# Extract only the count matrix +count_matrix <- df_GSE43732[, 1:ncol(df_GSE43732)] +# Sample information +sample_names <- colnames(count_matrix) +conditions = c("cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "normal", "normal", "normal", +"cancer", "cancer", "cancer", "cancer", "normal", "normal", "normal", "normal") +sample_info <- data.frame(Sample = sample_names, condition = conditions) +#### SKIRMISH FOR DESEQ2 DATASET ########### +#make NA's to 0 +count_matrix <- as.matrix(sapply(count_matrix, as.numeric)) +count_matrix[is.na(count_matrix)] <- 0 +count_matrix <- round(count_matrix) +#get rownames of previous data +row_names_GSE43732 <- rownames(GSE43732) +# Set gene_id to rownames again +rownames(count_matrix) <- row_names_GSE43732 +#negative values to abs() --> simulate counts +count_matrix <- abs(count_matrix) +# Create the DESeqDataSet using the updated sample_info +dds <- DESeqDataSetFromMatrix(countData = count_matrix, +colData = sample_info, +design = ~ condition) +# Perform differential gene expression analysis +dds <- DESeq(dds) +resultsNames(dds) +par(mar=c(8,5,2,2)) +boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2) +res <- results(dds, name="condition_normal_vs_cancer") +summary(res) +# Adjustment to overcome NA-error --> exclude rows containing NA's +filtered_indices <- which(res$log2FoldChange > 1 & res$padj < 0.05 & !is.na(res$padj)) +filtered_downregulated_genes <- res[filtered_indices, ] +gene_names <- rownames(filtered_downregulated_genes) +gene_names +results(dds) +results <- results(dds) +# Calculate the log2 fold change and -log10 adjusted p-values +results$log2FoldChange <- with(results, ifelse(baseMean > 0, log2FoldChange, -log2FoldChange)) +results$negLog10padj <- -log10(results$padj) +# Plot the volcano plot +plot(results$log2FoldChange, results$negLog10padj, +xlim = c(-max(abs(results$log2FoldChange)), max(abs(results$log2FoldChange)) + 1), +ylim = c(0, max(results$negLog10padj) + 1), +xlab = "Log2 Fold Change", +ylab = "-log10(Adjusted p-value)", +main = "Volcano Plot") +# Calculate the log2 fold change and -log10 adjusted p-values +results$log2FoldChange <- with(results, ifelse(baseMean > 0, log2FoldChange, -log2FoldChange)) +results$negLog10padj <- -log10(results$padj) +# Determine the range of log2 fold change values +logFC_range <- range(results$log2FoldChange, na.rm = TRUE) +# Plot the volcano plot +plot(results$log2FoldChange, results$negLog10padj, +xlim = c(logFC_range[1] - 1, logFC_range[2] + 1), +ylim = c(0, max(results$negLog10padj) + 1), +xlab = "Log2 Fold Change", +ylab = "-log10(Adjusted p-value)", +main = "Volcano Plot") +# Calculate the log2 fold change and -log10 adjusted p-values +results$log2FoldChange <- with(results, ifelse(baseMean > 0, log2FoldChange, -log2FoldChange)) +results$negLog10padj <- -log10(results$padj) +# Determine the range of -log10 adjusted p-values +logP_range <- range(results$negLog10padj, na.rm = TRUE) +# Plot the volcano plot +plot(results$log2FoldChange, results$negLog10padj, +xlim = c(min(results$log2FoldChange), max(results$log2FoldChange)), +ylim = c(logP_range[1] - 1, logP_range[2] + 1), +xlab = "Log2 Fold Change", +ylab = "-log10(Adjusted p-value)", +main = "Volcano Plot") +# Add points for significantly differentially expressed genes +significant_genes <- subset(results, padj < 0.05) +points(significant_genes$log2FoldChange, significant_genes$negLog10padj, +col = "red", pch = 16) +# Calculate the log2 fold change and -log10 adjusted p-values +results$log2FoldChange <- with(results, ifelse(baseMean > 0, log2FoldChange, -log2FoldChange)) +results$negLog10padj <- -log10(results$padj) +ggplot(res_tableOE_tb, aes(x = log2FoldChange, y = -log10(pvalue))) + +geom_point(aes(colour = threshold_OE)) + +geom_text_repel(aes(label = genelabels)) + +xlab("log2 fold change") + +ylab("-log10 p-value") + +theme(legend.position = c(0.8, 0.2), +plot.title = element_text(size = rel(1.5), hjust = 0.5), +axis.title = element_text(size = rel(1.25))) +ggplot(results, aes(x = log2FoldChange, y = -log10(pvalue))) + +geom_point(aes(colour = threshold_OE)) + +geom_text_repel(aes(label = genelabels)) + +xlab("log2 fold change") + +ylab("-log10 p-value") + +theme(legend.position = c(0.8, 0.2), +plot.title = element_text(size = rel(1.5), hjust = 0.5), +axis.title = element_text(size = rel(1.25))) +View(results) +library(GEOquery) +library(limma) +library(umap) +gset <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=FALSE) +if (length(gset) > 1) idx <- grep("GPL16543", attr(gset, "names")) else idx <- 1 +gset <- gset[[idx]] +# make proper column names to match toptable +fvarLabels(gset) <- make.names(fvarLabels(gset)) +# group membership for all samples +gsms <- paste0("11110000111100001111000011110000111100001111000011", +"11000011110000111100001111000011110000111100001111", +"00001111000011110000111100000111100011110000111100", +"00111100001011110000111100001111000011110000111100", +"00111000111100001111000011100011110000") +sml <- strsplit(gsms, split="")[[1]] +# log2 transformation +ex <- exprs(gset) +qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T)) +LogC <- (qx[5] > 100) || +(qx[6]-qx[1] > 50 && qx[2] > 0) +if (LogC) { ex[which(ex <= 0)] <- NaN +exprs(gset) <- log2(ex) } +# assign samples to groups and set up design matrix +gs <- factor(sml) +groups <- make.names(c("Normal","Cancer")) +levels(gs) <- groups +gset$group <- gs +design <- model.matrix(~group + 0, gset) +colnames(design) <- levels(gs) +gset <- gset[complete.cases(exprs(gset)), ] # skip missing values +fit <- lmFit(gset, design) # fit linear model +# set up contrasts of interest and recalculate model coefficients +cts <- paste(groups[1], groups[2], sep="-") +cont.matrix <- makeContrasts(contrasts=cts, levels=design) +fit2 <- contrasts.fit(fit, cont.matrix) +# compute statistics and table of top significant genes +fit2 <- eBayes(fit2, 0.01) +tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250) +tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","SPOT_ID","miRNA_ID")) +write.table(tT, file=stdout(), row.names=F, sep="\t") +# Visualize and quality control test results. +# Build histogram of P-values for all genes. Normal test +# assumption is that most genes are not differentially expressed. +tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) +hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj", +ylab = "Number of genes", main = "P-adj value distribution") +# summarize test results as "up", "down" or "not expressed" +dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1) +# Venn diagram of results +vennDiagram(dT, circle.col=palette()) +# create Q-Q plot for t-statistic +t.good <- which(!is.na(fit2$F)) # filter out bad probes +qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic") +# volcano plot (log P-value vs log fold change) +colnames(fit2) # list contrast names +ct <- 1 # choose contrast of interest +volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20, +highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2))) +# MD plot (log fold change vs mean log expression) +# highlight statistically significant (p-adj < 0.05) probes +plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1) +abline(h=0) +library(GEOquery) +library(limma) +library(umap) +gset <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=FALSE) +if (length(gset) > 1) idx <- grep("GPL16543", attr(gset, "names")) else idx <- 1 +gset <- gset[[idx]] +# make proper column names to match toptable +fvarLabels(gset) <- make.names(fvarLabels(gset)) +# group membership for all samples +gsms <- paste0("11110000111100001111000011110000111100001111000011", +"11000011110000111100001111000011110000111100001111", +"00001111000011110000111100000111100011110000111100", +"00111100001011110000111100001111000011110000111100", +"00111000111100001111000011100011110000") +sml <- strsplit(gsms, split="")[[1]] +# log2 transformation +ex <- exprs(gset) +qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T)) +LogC <- (qx[5] > 100) || +(qx[6]-qx[1] > 50 && qx[2] > 0) +if (LogC) { ex[which(ex <= 0)] <- NaN +exprs(gset) <- log2(ex) } +# assign samples to groups and set up design matrix +gs <- factor(sml) +groups <- make.names(c("Normal","Cancer")) +levels(gs) <- groups +gset$group <- gs +design <- model.matrix(~group + 0, gset) +colnames(design) <- levels(gs) +gset <- gset[complete.cases(exprs(gset)), ] # skip missing values +fit <- lmFit(gset, design) # fit linear model +# set up contrasts of interest and recalculate model coefficients +cts <- paste(groups[1], groups[2], sep="-") +cont.matrix <- makeContrasts(contrasts=cts, levels=design) +fit2 <- contrasts.fit(fit, cont.matrix) +# compute statistics and table of top significant genes +fit2 <- eBayes(fit2, 0.01) +tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250) +tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","SPOT_ID","miRNA_ID")) +write.table(tT, file=stdout(), row.names=F, sep="\t") +# Visualize and quality control test results. +# Build histogram of P-values for all genes. Normal test +# assumption is that most genes are not differentially expressed. +tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) +hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj", +ylab = "Number of genes", main = "P-adj value distribution") +# summarize test results as "up", "down" or "not expressed" +dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1) +# Venn diagram of results +vennDiagram(dT, circle.col=palette()) +# create Q-Q plot for t-statistic +t.good <- which(!is.na(fit2$F)) # filter out bad probes +qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic") +library(GEOquery) +library(limma) +library(umap) +gset <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=FALSE) +if (length(gset) > 1) idx <- grep("GPL16543", attr(gset, "names")) else idx <- 1 +gset <- gset[[idx]] +# make proper column names to match toptable +fvarLabels(gset) <- make.names(fvarLabels(gset)) +# group membership for all samples +gsms <- paste0("11110000111100001111000011110000111100001111000011", +"11000011110000111100001111000011110000111100001111", +"00001111000011110000111100000111100011110000111100", +"00111100001011110000111100001111000011110000111100", +"00111000111100001111000011100011110000") +sml <- strsplit(gsms, split="")[[1]] +# log2 transformation +ex <- exprs(gset) +qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T)) +LogC <- (qx[5] > 100) || +(qx[6]-qx[1] > 50 && qx[2] > 0) +if (LogC) { ex[which(ex <= 0)] <- NaN +exprs(gset) <- log2(ex) } +# assign samples to groups and set up design matrix +gs <- factor(sml) +groups <- make.names(c("Normal","Cancer")) +levels(gs) <- groups +gset$group <- gs +design <- model.matrix(~group + 0, gset) +colnames(design) <- levels(gs) +gset <- gset[complete.cases(exprs(gset)), ] # skip missing values +fit <- lmFit(gset, design) # fit linear model +# set up contrasts of interest and recalculate model coefficients +cts <- paste(groups[1], groups[2], sep="-") +cont.matrix <- makeContrasts(contrasts=cts, levels=design) +fit2 <- contrasts.fit(fit, cont.matrix) +# compute statistics and table of top significant genes +fit2 <- eBayes(fit2, 0.01) +tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250) +tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","SPOT_ID","miRNA_ID")) +write.table(tT, file=stdout(), row.names=F, sep="\t") +# Visualize and quality control test results. +# Build histogram of P-values for all genes. Normal test +# assumption is that most genes are not differentially expressed. +tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) +hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj", +ylab = "Number of genes", main = "P-adj value distribution") +# summarize test results as "up", "down" or "not expressed" +dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1) +# Venn diagram of results +vennDiagram(dT, circle.col=palette()) +# create Q-Q plot for t-statistic +t.good <- which(!is.na(fit2$F)) # filter out bad probes +qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic") +# volcano plot (log P-value vs log fold change) +colnames(fit2) # list contrast names +ct <- 1 # choose contrast of interest +volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20, +highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2))) +library(GEOquery) +library(limma) +library(umap) +gset <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=FALSE) +if (length(gset) > 1) idx <- grep("GPL16543", attr(gset, "names")) else idx <- 1 +gset <- gset[[idx]] +# make proper column names to match toptable +fvarLabels(gset) <- make.names(fvarLabels(gset)) +# group membership for all samples +gsms <- paste0("11110000111100001111000011110000111100001111000011", +"11000011110000111100001111000011110000111100001111", +"00001111000011110000111100000111100011110000111100", +"00111100001011110000111100001111000011110000111100", +"00111000111100001111000011100011110000") +sml <- strsplit(gsms, split="")[[1]] +# log2 transformation +ex <- exprs(gset) +qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T)) +LogC <- (qx[5] > 100) || +(qx[6]-qx[1] > 50 && qx[2] > 0) +if (LogC) { ex[which(ex <= 0)] <- NaN +exprs(gset) <- log2(ex) } +# assign samples to groups and set up design matrix +gs <- factor(sml) +groups <- make.names(c("Normal","Cancer")) +levels(gs) <- groups +gset$group <- gs +design <- model.matrix(~group + 0, gset) +colnames(design) <- levels(gs) +gset <- gset[complete.cases(exprs(gset)), ] # skip missing values +fit <- lmFit(gset, design) # fit linear model +# set up contrasts of interest and recalculate model coefficients +cts <- paste(groups[1], groups[2], sep="-") +cont.matrix <- makeContrasts(contrasts=cts, levels=design) +fit2 <- contrasts.fit(fit, cont.matrix) +# compute statistics and table of top significant genes +fit2 <- eBayes(fit2, 0.01) +tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250) +tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","SPOT_ID","miRNA_ID")) +write.table(tT, file=stdout(), row.names=F, sep="\t") +# Visualize and quality control test results. +# Build histogram of P-values for all genes. Normal test +# assumption is that most genes are not differentially expressed. +tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) +hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj", +ylab = "Number of genes", main = "P-adj value distribution") +# summarize test results as "up", "down" or "not expressed" +dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1) +# Venn diagram of results +vennDiagram(dT, circle.col=palette()) +# create Q-Q plot for t-statistic +t.good <- which(!is.na(fit2$F)) # filter out bad probes +qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic") +# volcano plot (log P-value vs log fold change) +colnames(fit2) # list contrast names +ct <- 1 # choose contrast of interest +volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20, +highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2))) +# MD plot (log fold change vs mean log expression) +# highlight statistically significant (p-adj < 0.05) probes +plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1) +abline(h=0) +################################################################ +# General expression data analysis +ex <- exprs(gset) +# box-and-whisker plot +dev.new(width=3+ncol(gset)/6, height=5) +ord <- order(gs) # order samples by group +palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02", +"#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666")) +par(mar=c(7,4,2,1)) +title <- paste ("GSE43732", "/", annotation(gset), sep ="") +boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord]) +legend("topleft", groups, fill=palette(), bty="n") +dev.off() +# expression value distribution +par(mar=c(4,4,2,1)) +title <- paste ("GSE43732", "/", annotation(gset), " value distribution", sep ="") +plotDensities(ex, group=gs, main=title, legend ="topright") +# UMAP plot (dimensionality reduction) +ex <- na.omit(ex) # eliminate rows with NAs +ex <- ex[!duplicated(ex), ] # remove duplicates +ump <- umap(t(ex), n_neighbors = 15, random_state = 123) +par(mar=c(3,3,2,6), xpd=TRUE) +plot(ump$layout, main="UMAP plot, nbrs=15", xlab="", ylab="", col=gs, pch=20, cex=1.5) +legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20, +col=1:nlevels(gs), title="Group", pt.cex=1.5) +library("maptools") # point labels without overlaps +pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6) +# mean-variance trend, helps to see if precision weights are needed +plotSA(fit2, main="Mean variance trend, GSE43732") +mirnames <- read.csv("C:/Users/Emre/Desktop/miRNAs.csv") +mirnames <- read.csv("C:/Users/Emre/Desktop/miRNAs.xlsx") +View(mirnames) +mirnames <- read.csv("C:/Users/Emre/Desktop/miRNAs.xlsx", header = TRUE) +library(multiMiR) +library(readxl) +mirnames <- read_excel("C:/Users/Emre/Desktop/miRNAs.xlsx", header = TRUE) +mirnames <- read_excel("C:/Users/Emre/Desktop/miRNAs.xlsx") +View(mirnames) +library(multiMiR) +multimir_results <- get_multimir(org = 'hsa', +mirna = mirnames, +table = 'validated', +summary = TRUE) +head(multimir_results@data) +targets <- multimir_results@data +excel_file <- "C:/Users/Emre/Desktop/multimir_results_cyto.xlsx" +write_xlsx(targets, path = excel_file ) +library("xlsx") +library(xlsx) +install.packages(xlsx) +install.packages("xlsx") +library(xlsx) +write_xlsx(targets, path = excel_file ) +targets <- multimir_results@data +excel_file <- ("C:/Users/Emre/Desktop/multimir_results_cyto.xlsx") +library(xlsx) +write_xlsx(targets, path = excel_file ) +write_xlsx(targets, path = excel_file ) +library(writexl) +write_xlsx(targets, path = excel_file ) diff --git a/DGE/Limma/miRNA/GSE43732.R b/DGE/Limma/miRNA/GSE43732.R new file mode 100644 index 0000000000000000000000000000000000000000..b30697e99ef34015d25767f64f2c3eaf432e0a40 --- /dev/null +++ b/DGE/Limma/miRNA/GSE43732.R @@ -0,0 +1,114 @@ +library(GEOquery) +library(limma) +library(umap) + +# load series and platform data from GEO + +gset <- getGEO("GSE43732", GSEMatrix =TRUE, AnnotGPL=FALSE) +if (length(gset) > 1) idx <- grep("GPL16543", attr(gset, "names")) else idx <- 1 +gset <- gset[[idx]] + +# make proper column names to match toptable +fvarLabels(gset) <- make.names(fvarLabels(gset)) + +# group membership for all samples +gsms <- paste0("11110000111100001111000011110000111100001111000011", + "11000011110000111100001111000011110000111100001111", + "00001111000011110000111100000111100011110000111100", + "00111100001011110000111100001111000011110000111100", + "00111000111100001111000011100011110000") +sml <- strsplit(gsms, split="")[[1]] + +# log2 transformation +ex <- exprs(gset) +qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T)) +LogC <- (qx[5] > 100) || + (qx[6]-qx[1] > 50 && qx[2] > 0) +if (LogC) { ex[which(ex <= 0)] <- NaN + exprs(gset) <- log2(ex) } + +# assign samples to groups and set up design matrix +gs <- factor(sml) +groups <- make.names(c("Normal","Cancer")) +levels(gs) <- groups +gset$group <- gs +design <- model.matrix(~group + 0, gset) +colnames(design) <- levels(gs) + +gset <- gset[complete.cases(exprs(gset)), ] # skip missing values + +fit <- lmFit(gset, design) # fit linear model + +# set up contrasts of interest and recalculate model coefficients +cts <- paste(groups[1], groups[2], sep="-") +cont.matrix <- makeContrasts(contrasts=cts, levels=design) +fit2 <- contrasts.fit(fit, cont.matrix) + +# compute statistics and table of top significant genes +fit2 <- eBayes(fit2, 0.01) +tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250) + +tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","SPOT_ID","miRNA_ID")) +write.table(tT, file=stdout(), row.names=F, sep="\t") + +# Visualize and quality control test results. +# Build histogram of P-values for all genes. Normal test +# assumption is that most genes are not differentially expressed. +tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) +hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj", + ylab = "Number of genes", main = "P-adj value distribution") + +# summarize test results as "up", "down" or "not expressed" +dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1) + +# Venn diagram of results +vennDiagram(dT, circle.col=palette()) + +# create Q-Q plot for t-statistic +t.good <- which(!is.na(fit2$F)) # filter out bad probes +qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic") + +# volcano plot (log P-value vs log fold change) +colnames(fit2) # list contrast names +ct <- 1 # choose contrast of interest +volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20, + highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2))) + +# MD plot (log fold change vs mean log expression) +# highlight statistically significant (p-adj < 0.05) probes +plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1) +abline(h=0) + +################################################################ +# General expression data analysis +ex <- exprs(gset) + +# box-and-whisker plot +dev.new(width=3+ncol(gset)/6, height=5) +ord <- order(gs) # order samples by group +palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02", + "#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666")) +par(mar=c(7,4,2,1)) +title <- paste ("GSE43732", "/", annotation(gset), sep ="") +boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord]) +legend("topleft", groups, fill=palette(), bty="n") +dev.off() + +# expression value distribution +par(mar=c(4,4,2,1)) +title <- paste ("GSE43732", "/", annotation(gset), " value distribution", sep ="") +plotDensities(ex, group=gs, main=title, legend ="topright") + +# UMAP plot (dimensionality reduction) +ex <- na.omit(ex) # eliminate rows with NAs +ex <- ex[!duplicated(ex), ] # remove duplicates +ump <- umap(t(ex), n_neighbors = 15, random_state = 123) +par(mar=c(3,3,2,6), xpd=TRUE) +plot(ump$layout, main="UMAP plot, nbrs=15", xlab="", ylab="", col=gs, pch=20, cex=1.5) +legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20, +col=1:nlevels(gs), title="Group", pt.cex=1.5) +library("maptools") # point labels without overlaps +pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6) + +# mean-variance trend, helps to see if precision weights are needed +plotSA(fit2, main="Mean variance trend, GSE43732") \ No newline at end of file diff --git a/DGE/Limma/miRNA/GSE55857.R b/DGE/Limma/miRNA/GSE55857.R new file mode 100644 index 0000000000000000000000000000000000000000..c16aa4f6ffa192adbe2d3737c7f402a9669fd1d4 --- /dev/null +++ b/DGE/Limma/miRNA/GSE55857.R @@ -0,0 +1,117 @@ +# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0 +################################################################ +# Differential expression analysis with limma +library(GEOquery) +library(limma) +library(umap) + +# load series and platform data from GEO + +gset <- getGEO("GSE55857", GSEMatrix =TRUE, AnnotGPL=FALSE) +if (length(gset) > 1) idx <- grep("GPL14613", attr(gset, "names")) else idx <- 1 +gset <- gset[[idx]] + +# make proper column names to match toptable +fvarLabels(gset) <- make.names(fvarLabels(gset)) + +# group membership for all samples +gsms <- paste0("10101010101010101010101010101010101010101010101010", + "10101010101010101010101010101010101010101010101010", + "10101010101010101010101010101010101010101010101010", + "10101010101010101010101010101010101010101010101010", + "1010101010101010") +sml <- strsplit(gsms, split="")[[1]] + +# log2 transformation +ex <- exprs(gset) +qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T)) +LogC <- (qx[5] > 100) || + (qx[6]-qx[1] > 50 && qx[2] > 0) +if (LogC) { ex[which(ex <= 0)] <- NaN +exprs(gset) <- log2(ex) } + +# assign samples to groups and set up design matrix +gs <- factor(sml) +groups <- make.names(c("cancer","normal")) +levels(gs) <- groups +gset$group <- gs +design <- model.matrix(~group + 0, gset) +colnames(design) <- levels(gs) + +gset <- gset[complete.cases(exprs(gset)), ] # skip missing values + +fit <- lmFit(gset, design) # fit linear model + +# set up contrasts of interest and recalculate model coefficients +cts <- c(paste(groups[1],"-",groups[2],sep="")) +cont.matrix <- makeContrasts(contrasts=cts, levels=design) +fit2 <- contrasts.fit(fit, cont.matrix) + +# compute statistics and table of top significant genes +fit2 <- eBayes(fit2, 0.01) +tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250) + +tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","miRNA_ID_LIST","SPOT_ID")) +write.table(tT, file=stdout(), row.names=F, sep="\t") + +# Visualize and quality control test results. +# Build histogram of P-values for all genes. Normal test +# assumption is that most genes are not differentially expressed. +tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) +hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj", + ylab = "Number of genes", main = "P-adj value distribution") + +# summarize test results as "up", "down" or "not expressed" +dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1) + +# Venn diagram of results +vennDiagram(dT, circle.col=palette()) + +# create Q-Q plot for t-statistic +t.good <- which(!is.na(fit2$F)) # filter out bad probes +qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic") + +# volcano plot (log P-value vs log fold change) +colnames(fit2) # list contrast names +ct <- 1 # choose contrast of interest +volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20, + highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2))) + +# MD plot (log fold change vs mean log expression) +# highlight statistically significant (p-adj < 0.05) probes +plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1) +abline(h=0) + +################################################################ +# General expression data analysis +ex <- exprs(gset) + +# box-and-whisker plot +dev.new(width=3+ncol(gset)/6, height=5) +ord <- order(gs) # order samples by group +palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02", + "#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666")) +par(mar=c(7,4,2,1)) +title <- paste ("GSE55857", "/", annotation(gset), sep ="") +boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord]) +legend("topleft", groups, fill=palette(), bty="n") +dev.off() + +# expression value distribution +par(mar=c(4,4,2,1)) +title <- paste ("GSE55857", "/", annotation(gset), " value distribution", sep ="") +plotDensities(ex, group=gs, main=title, legend ="topright") + +# UMAP plot (dimensionality reduction) +ex <- na.omit(ex) # eliminate rows with NAs +ex <- ex[!duplicated(ex), ] # remove duplicates +ump <- umap(t(ex), n_neighbors = 15, random_state = 123) +par(mar=c(3,3,2,6), xpd=TRUE) +plot(ump$layout, main="UMAP plot, nbrs=15", xlab="", ylab="", col=gs, pch=20, cex=1.5) +legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20, + col=1:nlevels(gs), title="Group", pt.cex=1.5) +library("maptools") # point labels without overlaps +pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6) + +# mean-variance trend, helps to see if precision weights are needed +plotSA(fit2, main="Mean variance trend, GSE55857") \ No newline at end of file diff --git a/DGE/Limma/miRNA/GSE6188.R b/DGE/Limma/miRNA/GSE6188.R new file mode 100644 index 0000000000000000000000000000000000000000..0fa5a582fdd4f4f4bf6138ea522a67cbebbebc13 --- /dev/null +++ b/DGE/Limma/miRNA/GSE6188.R @@ -0,0 +1,118 @@ +# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0 +################################################################ +# Differential expression analysis with limma +library(GEOquery) +library(limma) +library(umap) + +# load series and platform data from GEO + +gset <- getGEO("GSE6188", GSEMatrix =TRUE, AnnotGPL=FALSE) +if (length(gset) > 1) idx <- grep("GPL4508", attr(gset, "names")) else idx <- 1 +gset <- gset[[idx]] + +# make proper column names to match toptable +fvarLabels(gset) <- make.names(fvarLabels(gset)) + +# group membership for all samples +gsms <- paste0("00000011111100110000001111110000000000000011111111", + "11110000000000111111111100000000000000111111111111", + "11000111100011111110011001100110011001100000011110", + "01100110101101001101100110011001111001100110011001", + "10011001100111111111111111111111111111111111111111", + "1111111") +sml <- strsplit(gsms, split="")[[1]] + +# log2 transformation +ex <- exprs(gset) +qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T)) +LogC <- (qx[5] > 100) || + (qx[6]-qx[1] > 50 && qx[2] > 0) +if (LogC) { ex[which(ex <= 0)] <- NaN +exprs(gset) <- log2(ex) } + +# assign samples to groups and set up design matrix +gs <- factor(sml) +groups <- make.names(c("Normal","Cancer")) +levels(gs) <- groups +gset$group <- gs +design <- model.matrix(~group + 0, gset) +colnames(design) <- levels(gs) + +gset <- gset[complete.cases(exprs(gset)), ] # skip missing values + +fit <- lmFit(gset, design) # fit linear model + +# set up contrasts of interest and recalculate model coefficients +cts <- c(paste(groups[1],"-",groups[2],sep="")) +cont.matrix <- makeContrasts(contrasts=cts, levels=design) +fit2 <- contrasts.fit(fit, cont.matrix) + +# compute statistics and table of top significant genes +fit2 <- eBayes(fit2, 0.01) +tT <- topTable(fit2, adjust="fdr", sort.by="B", number=250) + +tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","t","B","logFC","ORF","SPOT_ID")) +write.table(tT, file=stdout(), row.names=F, sep="\t") + +# Visualize and quality control test results. +# Build histogram of P-values for all genes. Normal test +# assumption is that most genes are not differentially expressed. +tT2 <- topTable(fit2, adjust="fdr", sort.by="B", number=Inf) +hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj", + ylab = "Number of genes", main = "P-adj value distribution") + +# summarize test results as "up", "down" or "not expressed" +dT <- decideTests(fit2, adjust.method="fdr", p.value=0.05, lfc=1) + +# Venn diagram of results +vennDiagram(dT, circle.col=palette()) + +# create Q-Q plot for t-statistic +t.good <- which(!is.na(fit2$F)) # filter out bad probes +qqt(fit2$t[t.good], fit2$df.total[t.good], main="Moderated t statistic") + +# volcano plot (log P-value vs log fold change) +colnames(fit2) # list contrast names +ct <- 1 # choose contrast of interest +volcanoplot(fit2, coef=ct, main=colnames(fit2)[ct], pch=20, + highlight=length(which(dT[,ct]!=0)), names=rep('+', nrow(fit2))) + +# MD plot (log fold change vs mean log expression) +# highlight statistically significant (p-adj < 0.05) probes +plotMD(fit2, column=ct, status=dT[,ct], legend=F, pch=20, cex=1) +abline(h=0) + +################################################################ +# General expression data analysis +ex <- exprs(gset) + +# box-and-whisker plot +dev.new(width=3+ncol(gset)/6, height=5) +ord <- order(gs) # order samples by group +palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02", + "#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666")) +par(mar=c(7,4,2,1)) +title <- paste ("GSE6188", "/", annotation(gset), sep ="") +boxplot(ex[,ord], boxwex=0.6, notch=T, main=title, outline=FALSE, las=2, col=gs[ord]) +legend("topleft", groups, fill=palette(), bty="n") +dev.off() + +# expression value distribution +par(mar=c(4,4,2,1)) +title <- paste ("GSE6188", "/", annotation(gset), " value distribution", sep ="") +plotDensities(ex, group=gs, main=title, legend ="topright") + +# UMAP plot (dimensionality reduction) +ex <- na.omit(ex) # eliminate rows with NAs +ex <- ex[!duplicated(ex), ] # remove duplicates +ump <- umap(t(ex), n_neighbors = 15, random_state = 123) +par(mar=c(3,3,2,6), xpd=TRUE) +plot(ump$layout, main="UMAP plot, nbrs=15", xlab="", ylab="", col=gs, pch=20, cex=1.5) +legend("topright", inset=c(-0.15,0), legend=levels(gs), pch=20, + col=1:nlevels(gs), title="Group", pt.cex=1.5) +library("maptools") # point labels without overlaps +pointLabel(ump$layout, labels = rownames(ump$layout), method="SANN", cex=0.6) + +# mean-variance trend, helps to see if precision weights are needed +plotSA(fit2, main="Mean variance trend, GSE6188") \ No newline at end of file diff --git a/Data_Annotation/GO_enrichment/GO_molecular_function.png b/Data_Annotation/GO_enrichment/GO_molecular_function.png new file mode 100644 index 0000000000000000000000000000000000000000..98a9cda956855889c0f17b5c045c7f6dc7f66566 Binary files /dev/null and b/Data_Annotation/GO_enrichment/GO_molecular_function.png differ diff --git a/MultiMIR/multimir_results.xlsx b/MultiMIR/multimir_results.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..4efaed048d3a9e42642d670fc34f188419f4bafb Binary files /dev/null and b/MultiMIR/multimir_results.xlsx differ diff --git a/MultiMIR/multimir_results_final.xlsx b/MultiMIR/multimir_results_final.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..848982cc118e450f41332c9cd21caec14050bde5 Binary files /dev/null and b/MultiMIR/multimir_results_final.xlsx differ diff --git a/project.PNG b/project.PNG new file mode 100644 index 0000000000000000000000000000000000000000..5468604b154878df572abaf37dcb9746783c6853 Binary files /dev/null and b/project.PNG differ