diff --git a/code/cnv_analysis_PGC.Rmd b/code/cnv_analysis_PGC.Rmd index 7a9d8a3769e91535b2fe3b34a126915bfb9b6e18..7f81152d811b195a415e27886826263235da0d28 100644 --- a/code/cnv_analysis_PGC.Rmd +++ b/code/cnv_analysis_PGC.Rmd @@ -128,15 +128,56 @@ RecCNV[RecCNV[,"q.value"]==0, "q.value"] <- as.numeric(minval) RecCNV[,"score"] <- sapply(RecCNV[,"q.value"], function(x) -log10(as.numeric(x))) ``` +```{r} +library(biomaRt) +library(GenomicRanges) + +#Set up an gene annotation template to use +mart <- useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl") +mart <- useMart(biomart="ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org", path="/biomart/martservice", dataset="hsapiens_gene_ensembl") +genes <- getBM(attributes=c("hgnc_symbol","chromosome_name","start_position","end_position"), mart=mart) +genes <- genes[genes[,1]!="" & genes[,2] %in% c(1:22,"X","Y"),] +xidx <- which(genes[,2]=="X") +yidx <- which(genes[,2]=="Y") +genes[xidx, 2] <- 23 +genes[yidx, 2] <- 24 +genes[,2] <- sapply(genes[,2],as.integer) +genes <- genes[order(genes[,3]),] +genes <- genes[order(genes[,2]),] +colnames(genes) <- c("GeneSymbol","Chr","Start","End") +genes_GR <- makeGRangesFromDataFrame(genes,keep.extra.columns = TRUE) +``` + +```{r} +colnames(RecCNV) <- c("chr","aberation", "start", "end", "size", "q.value","score") +df_GR <- makeGRangesFromDataFrame(RecCNV, keep.extra.columns = TRUE) +hits <- findOverlaps(genes_GR, df_GR, type="within") +df_ann <- cbind(RecCNV[subjectHits(hits),],genes[queryHits(hits),]) +head(df_ann) +write.table(df_ann, file = "/data/gpfs-1/users/phgi10_c/work/Uni/MultiOmic_Datascience/results/cnv_analysis/final_cnv.csv", sep = "\t", quote = FALSE, row.names = FALSE) +``` +```{r} +AberrantRegion <- paste0(df_ann[,1],":", df_ann[,3],"-", df_ann[,4]) +GeneRegion <- paste0(df_ann[,7],":", df_ann[,8],"-", df_ann[,9]) +Final_regions <- cbind(df_ann[,c(6,2,5)], AberrantRegion, GeneRegion) +Final_regions[seq(1, nrow(Final_regions), 20),] +``` + +```{r} +hist(Final_regions$q.value) +``` +```{r} +final_cnv_set <- subset(Final_regions, q.value <= 0.01) +``` ```{r} #Create a function for plotting the recurrent copy number variants gaiaCNVplot <- function (calls, threshold=0.05, main="main") { - Calls <- calls[order(calls[,"Region.Start..bp."]),] - Calls <- Calls[order(Calls[,"Chromosome"]),] + Calls <- calls[order(calls[,"start"]),] + Calls <- Calls[order(Calls[,"chr"]),] rownames(Calls) <- NULL - Chromo <- Calls[,"Chromosome"] - Gains <- apply(Calls,1,function(x) ifelse(x["Aberration.Kind"]==1, x["score"], 0)) - Losses <- apply(Calls, 1,function(x) ifelse(x["Aberration.Kind"]==0, x["score"], 0)) + Chromo <- Calls[,"chr"] + Gains <- apply(Calls,1,function(x) ifelse(x["aberation"]==1, x["score"], 0)) + Losses <- apply(Calls, 1,function(x) ifelse(x["aberation"]==0, x["score"], 0)) plot(Gains, ylim=c(-max(Calls [,"score"]+2), max(Calls[,"score"]+2)), type="h", col="red2", xlab="Chromosome", ylab=expression("-log"[10]~italic(Q)~"value"), main=main, cex.main=4, xaxt="n", font=2, font.axis=2, font.lab=2, font.axis=2) points(-(Losses), type="h", col="forestgreen") abline(h= 0, cex=4)