GSE120706為含兩組(Mock和Hsv1)樣本的二代測序數據,平臺為GPL24247 Illumina NovaSeq 6000 (Mus musculus),請進行數據加載、清洗,探針ID轉換和數據可視化(包括差異分析、富集分析和互作分析)。
參考鏈接: https://cloud.tencent.com/developer/article/2206152
############################GEO,單樣本數據,RNA-seq差異表達分析代碼(GSE120706)####################### load("GSE120706.Rdata") rownames(data) = data$GeneID # 加載必要的包 library(edgeR)
## 載入需要的程序包:limma
library(DESeq2)
## 載入需要的程序包:S4Vectors
## 載入需要的程序包:stats4
## 載入需要的程序包:BiocGenerics
## ## 載入程序包:'BiocGenerics'
## The following object is masked from 'package:limma': ## ## plotMA
## The following objects are masked from 'package:stats': ## ## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base': ## ## anyDuplicated, aperm, append, as.data.frame, basename, cbind, ## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find, ## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply, ## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int, ## Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff, ## table, tapply, union, unique, unsplit, which.max, which.min
## ## 載入程序包:'S4Vectors'
## The following object is masked from 'package:utils': ## ## findMatches
## The following objects are masked from 'package:base': ## ## expand.grid, I, unname
## 載入需要的程序包:IRanges
## ## 載入程序包:'IRanges'
## The following object is masked from 'package:grDevices': ## ## windows
## 載入需要的程序包:GenomicRanges
## 載入需要的程序包:GenomeInfoDb
## 載入需要的程序包:SummarizedExperiment
## 載入需要的程序包:MatrixGenerics
## 載入需要的程序包:matrixStats
## ## 載入程序包:'MatrixGenerics'
## The following objects are masked from 'package:matrixStats': ## ## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse, ## colCounts, colCummaxs, colCummins, colCumprods, colCumsums, ## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs, ## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats, ## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds, ## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads, ## colWeightedMeans, colWeightedMedians, colWeightedSds, ## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet, ## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods, ## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps, ## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins, ## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks, ## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars, ## rowWeightedMads, rowWeightedMeans, rowWeightedMedians, ## rowWeightedSds, rowWeightedVars
## 載入需要的程序包:Biobase
## Welcome to Bioconductor ## ## Vignettes contain introductory material; view with ## 'browseVignettes()'. To cite Bioconductor, see ## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## ## 載入程序包:'Biobase'
## The following object is masked from 'package:MatrixGenerics': ## ## rowMedians
## The following objects are masked from 'package:matrixStats': ## ## anyMissing, rowMedians
library(FactoMineR) library(factoextra)
## 載入需要的程序包:ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(clusterProfiler)
##
## clusterProfiler v4.12.6 Learn more at https://yulab-smu.top/contribution-knowledge-mining/ ## ## Please cite: ## ## G Yu. Thirteen years of clusterProfiler. The Innovation. 2024, ## 5(6):100722
## ## 載入程序包:'clusterProfiler'
## The following object is masked from 'package:IRanges': ## ## slice
## The following object is masked from 'package:S4Vectors': ## ## rename
## The following object is masked from 'package:stats': ## ## filter
library(org.Mm.eg.db)
## 載入需要的程序包:AnnotationDbi
## ## 載入程序包:'AnnotationDbi'
## The following object is masked from 'package:clusterProfiler': ## ## select
##
library(stringr) library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ── ## ? dplyr 1.1.4 ? readr 2.1.5 ## ? forcats 1.0.0 ? tibble 3.2.1 ## ? lubridate 1.9.4 ? tidyr 1.3.1 ## ? purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ── ## ? lubridate::%within%() masks IRanges::%within%() ## ? dplyr::collapse() masks IRanges::collapse() ## ? dplyr::combine() masks Biobase::combine(), BiocGenerics::combine() ## ? dplyr::count() masks matrixStats::count() ## ? dplyr::desc() masks IRanges::desc() ## ? tidyr::expand() masks S4Vectors::expand() ## ? dplyr::filter() masks clusterProfiler::filter(), stats::filter() ## ? dplyr::first() masks S4Vectors::first() ## ? dplyr::lag() masks stats::lag() ## ? ggplot2::Position() masks BiocGenerics::Position(), base::Position() ## ? purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce() ## ? dplyr::rename() masks clusterProfiler::rename(), S4Vectors::rename() ## ? lubridate::second() masks S4Vectors::second() ## ? lubridate::second<-() masks S4Vectors::second<-() ## ? dplyr::select() masks AnnotationDbi::select(), clusterProfiler::select() ## ? purrr::simplify() masks clusterProfiler::simplify() ## ? dplyr::slice() masks clusterProfiler::slice(), IRanges::slice() ## ? Use the conflicted package ( ) to force all conflicts to become errors
library(ggplot2) library(patchwork) library(pheatmap) library(EnhancedVolcano)
## 載入需要的程序包:ggrepel
library(RColorBrewer) ## 1. 數據預處理 ------------------------------------------------------------ # 假設data是原始計數矩陣,包含兩列(Mock和HSV) rawcount <- data[,2:3] # 更嚴格的基因過濾 - 至少在75%樣本中count>1 keep <- rowSums(rawcount > 1) >= floor(0.75 * ncol(rawcount)) filter_count <- rawcount[keep, ] # 創建DGEList對象 dge <- DGEList(counts = filter_count, group = rep(c("Mock", "HSV"), each = 1)) # TMM標準化并計算CPM值 dge <- calcNormFactors(dge) express_cpm <- cpm(dge, log = TRUE, prior.count = 1) ## 2. 質控可視化 ----------------------------------------------------------- # 箱線圖函數 plot_box <- function(expr_data, title = "Expression Distribution") { expr_data %>% as.data.frame() %>% pivot_longer(everything(), names_to = "sample", values_to = "expression") %>% ggplot(aes(x = sample, y = expression, fill = sample)) + geom_boxplot() + scale_fill_brewer(palette = "Set2") + labs(x = NULL, y = "log2(CPM+1)", title = title) + theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) } # PCA圖函數 plot_pca <- function(expr_data, groups) { pca_res <- prcomp(t(expr_data), scale. = TRUE) fviz_pca_ind(pca_res, geom.ind = "point", col.ind = groups, palette = c("#00AFBB", "#E7B800"), addEllipses = TRUE, ellipse.type = "confidence", legend.title = "Groups", title = "PCA - Sample Clustering") + theme_bw() } # 繪制質控圖 p_box <- plot_box(express_cpm) p_pca <- plot_pca(express_cpm, rep(c("Mock", "HSV"), each = 1)) # 使用patchwork組合圖形 p_box + p_pca + plot_layout(widths = c(1, 2))
## Warning: Computation failed in `stat_conf_ellipse()`. ## Caused by error in `if (scale[1] > 0) ...`: ## ! 需要TRUE/FALSE值的地方不可以用缺少值
## 3. 差異表達分析 --------------------------------------------------------- # 轉換基因ID (ENSEMBL到SYMBOL) convert_ids <- function(count_matrix) { ids <- str_split(rownames(count_matrix), "\\.", simplify = TRUE)[,1] id_map <- bitr(ids, fromType = "ENSEMBL", toType = "SYMBOL", OrgDb = org.Mm.eg.db) # 保留唯一映射的基因 id_map <- id_map[!duplicated(id_map$SYMBOL), ] count_matrix %>% as.data.frame() %>% mutate(ENSEMBL = str_split(rownames(.), "\\.", simplify = TRUE)[,1]) %>% inner_join(id_map, by = "ENSEMBL") %>% dplyr::select(-ENSEMBL) %>% column_to_rownames("SYMBOL") %>% as.matrix() } # 轉換ID expr_set <- convert_ids(filter_count)
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(ids, fromType = "ENSEMBL", toType = "SYMBOL", OrgDb = ## org.Mm.eg.db): 0.43% of input gene IDs are fail to map...
# 使用edgeR進行差異分析 run_edger <- function(count_matrix, groups) { dge <- DGEList(counts = count_matrix, group = groups) dge <- calcNormFactors(dge) # 當沒有生物學重復時,使用預設dispersion if (ncol(count_matrix) <= 2) { message("No replicates - using preset dispersion (0.1)") bcv <- 0.1 et <- exactTest(dge, dispersion = bcv^2) } else { dge <- estimateDisp(dge) et <- exactTest(dge) } topTags(et, n = nrow(count_matrix))$table %>% as.data.frame() %>% rownames_to_column("gene") %>% mutate(regulated = case_when( FDR < 0.01 & logFC > 1 ~ "up", FDR < 0.01 & logFC < -1 ~ "down", TRUE ~ "ns" )) } # 執行差異分析 de_results <- run_edger(expr_set, rep(c("Mock", "HSV"), each = 1))
## No replicates - using preset dispersion (0.1)
# 查看差異基因統計 table(de_results$regulated)
## ## down ns up ## 2134 9558 2001
## 4. 結果可視化 ----------------------------------------------------------- # 火山圖 EnhancedVolcano(de_results, lab = de_results$gene, x = "logFC", y = "FDR", pCutoff = 0.01, FCcutoff = 2, title = "HSV vs Mock", subtitle = "Differential Expression", legendPosition = "right")
特別聲明:以上內容(如有圖片或視頻亦包括在內)為自媒體平臺“網易號”用戶上傳并發布,本平臺僅提供信息存儲服務。
Notice: The content above (including the pictures and videos if any) is uploaded and posted by a user of NetEase Hao, which is a social media platform and only provides information storage services.