TCGA 数据分析实战 —— 突变及拷贝数分析

文章目录

TCGA 数据分析实战 —— 突变及拷贝数分析

前言

在介绍完 TCGAbiolinks 的查询下载和数据分析功能之后，我们简单展示几个示例，来练练手，加深对这个包的理解和使用

我们主要从基因组、转录组和表观组 3 个维度分别举例来进行说明。

基因组分析

拷贝数变异（CNV）在癌症的发生和发展研究中扮演重要的角色。由于基因组重排（如染色体缺失、重复、插入和易位），导致染色体片段的扩增或删失。

CNV 是大于 1kb 的基因组区域发生拷贝数的扩增和删失。

TCGA 的数据可以分为三个等级：

level 1: 原始的测序数据（fasta、fastq等）
level 2: 比对好的 bam 文件
level 3: 经过标准化处理的数据

我们需要获取到 level 3 的数据来进行分析，来识别癌症基因组变异

数据预处理

我们分析的是 legacy 数据库中 Affymetrix Genome-Wide Human SNP Array 6.0 平台的数据

获取 20 个 GBM 和 LGG 样本的 CNV 数据

library(TCGAbiolinks) # 2.30.4 library(tidyverse) query.gbm.nocnv <- GDCquery( project = "TCGA-GBM", data.category = "Copy Number Variation", data.type = "Masked Copy Number Segment", access = "open" ) # 只挑选 20 个样本 query.gbm.nocnv$results[[1]] <- query.gbm.nocnv$results[[1]][1:20,] GDCdownload(query.gbm.nocnv) cnvMatrix <- GDCprepare(query.gbm.nocnv) %>% filter(abs(Segment_Mean) > 0.3) %>% mutate(label = if_else(Segment_Mean < -0.3, 0, 1)) %>% dplyr::select(-Segment_Mean, -GDC_Aliquot) %>% rename_with(~ c("Chromosome", "Start", "End", "Num.of.Markers", "Sample.Name", "Aberration")) %>% mutate(Chromosome = ifelse(Chromosome == "X", 23, Chromosome), Chromosome = ifelse(Chromosome == "Y", 24, Chromosome), Chromosome = as.integer(Chromosome)) %>% dplyr::select(c(5, 1:4, 6)) %>% # 注意要转换为 data.frame 格式 as.data.frame()

查看 cnv 矩阵

cnvMatrix[1:3,1:3] # Sample.Name Chromosome Start # 1 TCGA-16-1056-10A-01D-0517-01 2  # 2 TCGA-16-1056-10A-01D-0517-01 2  # 3 TCGA-16-1056-10A-01D-0517-01 2

识别 recurrent CNV

癌症相关的 CNV 会在许多样本中反复出现，我们使用 GAIA 包来识别这种显著的 recurrent CNV

GAIA 算法基于随机排列的统计检验，同时考虑样本内的显著性和同质性，以重复迭代的方式删除区间内的峰，直至剩下的峰没有显著性为止识别显著的 CNV

处理探针数据，这个需要从 GDC Reference Files 中下载相应版本的探针集合。

将文件读取进来，并进行相应的处理

markersMatrix <- readr::read_tsv( "~/Downloads/snp6.na35.remap.hg38.subset.txt", show_col_types = FALSE ) %>% dplyr::filter(freqcnv == FALSE) %>% # use for GISTIC dplyr::select(1:3) %>% rename_with(~c("Probe.Name", "Chromosome", "Start")) %>% mutate(Chromosome = ifelse(Chromosome == "X", 23, Chromosome), Chromosome = ifelse(Chromosome == "Y", 24, Chromosome), Chromosome = as.integer(Chromosome)) %>% dplyr::filter(!duplicated(Probe.Name)) %>% as.data.frame %>% arrange(Chromosome, Start) # 排序是重要的

运行 gaia，识别 recurrent CNV

library(gaia) set.seed(200) markers_obj <- load_markers(as.data.frame(markersMatrix_filtered)) nbsamples <- length(unique(cnvMatrix$Sample.Name)) cnv_obj <- load_cnv(cnvMatrix, markers_obj, nbsamples) suppressWarnings({ cancer <- "GBM" results <- runGAIA( cnv_obj, markers_obj, output_file_name = paste0("~/Downloads/GAIA_", cancer, "_flt.txt"), # 指定需要分析的变异类型，-1 分析所有的变异 aberrations = -1, # 指定需要分析的染色体，默认为 -1，表示所有染色体 chromosomes = -1, # 使用近似的方式可以加快计算速度 approximation = TRUE, # 设置迭代次数 num_iterations = 5000, threshold = 0.25 ) })

在运行 load_cnv 或 runGAIA 时经常会报错，可以使用下面修改后的代码。

load_cnv <- function(segmentation_matrix, markers_list, num_of_samples){ message("Loading Copy Number Data"); # Detect the chromosomes chromosomes <- as.numeric(sort(unique(names(markers_list)))); chromosomes <- chromosomes[which(!is.na(chromosomes))]; # Detect the aberration kinds aberration_kinds <- 1:length(unique(segmentation_matrix[,6])); names(aberration_kinds) <- sort(unique(segmentation_matrix[,6])); # Detect the samples samples <- 1:num_of_samples; if(!is.numeric(segmentation_matrix[,1])){ sample_names <- unique(segmentation_matrix[,1]); for(i in 1:length(sample_names)){ segmentation_matrix[which(segmentation_matrix[,1]==sample_names[i]),1] <- i; } } region_list <- list(); # Create the final structure list of the returned list for(k in 1:length(aberration_kinds)){ region_list[[ aberration_kinds[k] ]] <- list(); for(i in 1:length(chromosomes) ){ region_list[[ aberration_kinds[k] ]][[ chromosomes[i] ]] <- matrix(0, length(samples), ncol(markers_list[[ chromosomes[i] ]])); rownames(region_list[[ aberration_kinds[k] ]][[ chromosomes[i] ]]) <- samples; colnames(region_list[[ aberration_kinds[k] ]][[ chromosomes[i] ]]) <- c(1:ncol(markers_list[[ chromosomes[i] ]])); } } for(k in 1:length(aberration_kinds)){ ab_ids <- which(segmentation_matrix[,6]==names(aberration_kinds[k])); # In this matrix all regions aberrant as aberration_kinds[k] are stored tmp_matrix1 <- segmentation_matrix[ab_ids,]; if(class(tmp_matrix1)=="numeric"){ tmp_matrix1 <- t(as.matrix(tmp_matrix1)); } for(i in 1:length(chromosomes) ){ # In this matrix all regions aberrant as aberration_kinds[k] for the i-th chromsome are stored tmp_matrix2 <- tmp_matrix1[which(tmp_matrix1[,2]==chromosomes[i]),]; if(class(tmp_matrix2)=="numeric"){ tmp_matrix2 <- t(as.matrix(tmp_matrix2)); } message(".", appendLF = FALSE); for(j in 1:length(samples)){ #In this matrix all regions aberrant as aberration_kinds[k] for the i-th chromsome and for the j-th sample are stored tmp_matrix3 <- tmp_matrix2[which(tmp_matrix2[,1]==samples[j]),]; if(class(tmp_matrix3)=="numeric"){ tmp_matrix3 <- t(as.matrix(tmp_matrix3)); } if(nrow(tmp_matrix3)>0){ for(t in 1:nrow(tmp_matrix3)){ start_prob <- tmp_matrix3[t,3]; end_prob <- tmp_matrix3[t,4]; start_index <- which(markers_list[[ chromosomes[i] ]][1,] == start_prob); end_index <- which(markers_list[[ chromosomes[i] ]][2,] == end_prob); if (is_empty(start_index) | is_empty(end_index)) next region_list[[ aberration_kinds[k] ]][[ chromosomes[i] ]][samples[j], start_index:end_index] <- 1; } } } } } message("\nDone"); names(region_list) <- names(aberration_kinds); return(region_list); } peel_off <- function (pvalues_list, threshold, chromosomes, aberrations, discontinuity, hom_threshold) { regions_list <- list() for (i in 1:length(aberrations)) { region_chromosome_list <- list() cnv_index <- aberrations[i] for (j in 1:length(chromosomes)) { property_list <- list() chromosome_index <- chromosomes[j] message(".", appendLF = FALSE) curr_pvalue <- pvalues_list[[cnv_index]][[chromosome_index]] qvals <- qvalue(curr_pvalue) curr_qvalue <- qvals tmp_qvalue <- curr_qvalue tmp_start <- c() tmp_end <- c() start <- c() end <- c() qval <- c() tmp_pvalue <- curr_pvalue while (min(tmp_qvalue) < threshold) { tmp_list <- search_peaks_in_regions(tmp_qvalue, 1, length(tmp_qvalue), discontinuity[[chromosome_index]], hom_threshold, threshold) # 原来是 tmp_list[[1]] == -1 if (tmp_list[[1]][1] == -1) { break } else { tmp_start <- tmp_list[[1]] tmp_end <- tmp_list[[2]] for (k in 1:length(tmp_start)) { start <- c(start, tmp_start[k]) end <- c(end, tmp_end[k]) qval <- c(qval, tmp_qvalue[tmp_start[k]]) tmp_pvalue[tmp_start[k]:tmp_end[k]] <- 1 } } qvals <- qvalue(tmp_pvalue) tmp_qvalue <- qvals } if (length(start) > 0) { property_list[[1]] <- start property_list[[2]] <- end property_list[[3]] <- qval } else { property_list[[1]] <- -1 property_list[[2]] <- -1 property_list[[3]] <- -1 } region_chromosome_list[[chromosome_index]] <- property_list } regions_list[[cnv_index]] <- region_chromosome_list } message("\nDone") return(regions_list) } myrunGAIA <- function (cnv_obj, markers_obj, output_file_name = "", aberrations = -1, chromosomes = -1, num_iterations = 10, threshold = 0.25, hom_threshold = 0.12, approximation = FALSE) { message("\nPerforming Data Preprocessing") if (chromosomes == -1) { chromosomes <- as.numeric(sort(unique(names(markers_obj)))) chromosomes <- chromosomes[which(!is.na(chromosomes))] } else { known_chr <- as.numeric(names(markers_obj)) if ((length(known_chr[chromosomes]) != length(chromosomes)) || (sum(is.na(known_chr[chromosomes])) > 0)) { error_string <- "Error in the list of chromosomes passed as argument.\n" error_string <- cat(error_string, "The list of the chromosomes that can be analyzed follows:\n", known_chr, "\n") stop(error_string, call. = FALSE) } } if (aberrations == -1) { aberrations <- as.numeric(names(cnv_obj)) names(aberrations) <- names(cnv_obj) if (length(aberrations) > 0 && aberrations[1] == 0) { aberrations <- aberrations + 1 } } else { aberrations <- as.numeric(names(cnv_obj)) names(aberrations) <- names(cnv_obj) if (length(aberrations) > 0 && aberrations[1] == 0) { aberrations <- aberrations + 1 } known_aberr <- as.numeric(names(cnv_obj)) if ((length(known_aberr[aberrations]) != length(aberrations)) || (sum(is.na(known_aberr[aberrations]) > 0))) { error_string <- "Error in the list of aberrations passed as argument.\n" error_string <- cat(error_string, "The aberrations that can be analyzed follow:\n", known_aberr, "\n") stop(error_string, call. = FALSE) } } discontinuity <- list() if (length(aberrations) == 2 && hom_threshold >= 0) { message("\nDone") message("\nComputing Discontinuity Matrix") for (i in 1:length(chromosomes)) { message(".", appendLF = FALSE) tmp <- cnv_obj[[2]][[chromosomes[i]]] - cnv_obj[[1]][[chromosomes[i]]] tmp_vec <- 0 * c(1:(ncol(tmp) - 1)) for (k in 1:(ncol(tmp) - 1)) { for (z in 1:nrow(tmp)) { tmp_vec[k] <- tmp_vec[k] + abs(tmp[z, k] - tmp[z, k + 1]) } } discontinuity[[chromosomes[i]]] <- tmp_vec/nrow(cnv_obj[[2]][[chromosomes[i]]]) } } else { if (length(aberrations) != 2 && hom_threshold >= 0) { message("\nHomogeneous cannot be applied on the data (data must contain exactly two different kinds of aberrations)\n") } for (i in 1:length(chromosomes)) { tmp <- cnv_obj[[1]][[chromosomes[i]]] - cnv_obj[[1]][[chromosomes[i]]] tmp_vec <- 0 * c(1:(ncol(tmp) - 1)) discontinuity[[chromosomes[i]]] <- tmp_vec hom_threshold = -1 } } message("\nDone") null_hypothesis_list <- list() null_hypothesis_chromosome_list <- list() message("Computing Probability Distribution") for (k in 1:length(aberrations)) { null_hypothesis_chromosome_list <- list() for (i in 1:length(chromosomes)) { aberrations_index <- (aberrations[k]) chromosome_index <- chromosomes[i] obs_data <- cnv_obj[[aberrations_index]][[chromosome_index]] message(".", appendLF = FALSE) if (approximation == FALSE) { null_hypothesis_chromosome_list[[chromosome_index]] <- generate_null_hypothesis(obs_data, num_iterations) } if (approximation == TRUE) { null_hypothesis_chromosome_list[[chromosome_index]] <- generate_approx_null_hypothesis(obs_data, num_iterations) } } null_hypothesis_list[[aberrations_index]] <- null_hypothesis_chromosome_list } pvalue_distribution_list <- list() for (k in 1:length(aberrations)) { tmp_pvalue_list <- list() for (i in 1:length(chromosomes)) { message(".", appendLF = FALSE) tmp_pvalue_list[[chromosomes[i]]] <- rev(cumsum(rev(null_hypothesis_list[[aberrations[k]]][[chromosomes[i]]]))) } pvalue_distribution_list[[aberrations[k]]] <- tmp_pvalue_list } message("\nDone") pvalues_list <- list() message("Assessing the Significance of Observed Data") for (k in 1:length(aberrations)) { pvalue_chromosome_list <- list() for (i in 1:length(chromosomes)) { message(".", appendLF = FALSE) aberrations_index <- (aberrations[k]) chromosome_index <- chromosomes[i] curr_pvalue <- pvalue_distribution_list[[aberrations_index]][[chromosome_index]] curr_obs_data <- cnv_obj[[aberrations_index]][[chromosome_index]] obs_freq <- apply(curr_obs_data, 2, sum) pvalue_chromosome_list[[chromosome_index]] <- round(curr_pvalue[obs_freq[] + 1], 7) } pvalues_list[[aberrations_index]] <- pvalue_chromosome_list } message("\nDone") if (length(aberrations) == 2) { gistic_label <- c("Del", "Amp") message("Writing ", paste(output_file_name, ".igv.gistic", sep = ""), " File for Integrative Genomics Viewer (IGV) Tool") gistic <- matrix(nrow = 0, ncol = 8) colnames(gistic) <- c("Type", "Chromosome", "Start", "End", "q-value", "G-score", "average amplitude", "frequency") for (k in 1:length(aberrations)) { tmp_pvalue_list <- list() for (i in 1:length(chromosomes)) { message(".", appendLF = FALSE) curr_qval <- as.numeric(pvalues_list[[aberrations[k]]][[chromosomes[i]]]) curr_qval <- qvalue(curr_qval) start <- 1 for (z in 2:(length(curr_qval))) { if (curr_qval[z - 1] != curr_qval[z]) { end <- z - 1 print_qval <- curr_qval[z - 1] gistic <- rbind(gistic, c(gistic_label[k], chromosomes[i], markers_obj[[chromosomes[i]]][1, start], markers_obj[[chromosomes[i]]][2, end], print_qval, 0, 0, 0)) start <- z } } end <- length(curr_qval) print_qval <- curr_qval[end] gistic <- rbind(gistic, c(gistic_label[k], chromosomes[i], markers_obj[[chromosomes[i]]][1, start], markers_obj[[chromosomes[i]]][2, end], print_qval, 0, 0, 0)) } } write.table(gistic, paste(output_file_name, ".igv.gistic", sep = ""), sep = "\t", col.names = TRUE, row.names = FALSE, eol = "\n", quote = FALSE) message("\nDone") } if (hom_threshold >= 0) { message("Running Homogeneous peel-off Algorithm With Significance Threshold of ", threshold, " and Homogenous Threshold of ", hom_threshold) } else { message("Running Standard peel-off Algorithm With Significance Threshold of ", threshold) } significant_regions_list <- peel_off(pvalues_list, threshold, chromosomes, aberrations, discontinuity, hom_threshold) if (output_file_name != "") { message("\nWriting Output File '", output_file_name, "' Containing the Significant Regions\n", sep = "") results <- write_significant_regions(markers_obj, significant_regions_list, output_file_name, chromosomes, aberrations) message("File '", output_file_name, "' Saved\n", sep = "") } else { results <- write_significant_regions(markers_obj, significant_regions_list, output_file_name, chromosomes, aberrations) } return(results) }

最后识别重复出现的 CNV，并绘制结果图

RecCNV <- as_tibble(results) %>% mutate_at(vars("q-value"), as.numeric) %>% mutate_at(vars(-"q-value"), as.integer) %>% { # 如果 q-value 为 0，则设置其值为数据中的最小非零值 minval = min(.$`q-value`[which(.$`q-value` != 0)]) mutate(., `q-value` = ifelse(`q-value` == 0, minval, `q-value`)) } %>% mutate(score = -log10(`q-value`)) %>% as.data.frame() threshold <- 0.3 gaiaCNVplot(RecCNV,threshold) save(results, RecCNV, threshold, file = paste0("~/Downloads/", cancer,"_CNV_results.rda"))

recurrent CNV 基因注释

在使用 GAIA 识别出癌症中重复出现的基因组区域变异之后，我们需要将其注释到对应的基因。

我们使用 biomaRt 来获取所有人类基因的基因组范围信息，然后将显著变异的基因组区域映射到对应的基因上

library(GenomicRanges) # 注意，这里用的是 hg38 的基因信息， genes <- TCGAbiolinks:::get.GRCh.bioMart(genome = "hg38") %>% filter(gene_name != "" & seqnames %in% paste0('chr', c(1:22, "X", "Y"))) %>% mutate( seqnames = ifelse( seqnames == "X", 23, ifelse(seqnames == "Y", 24, seqnames) ), seqnames = as.integer(seqnames) ) %>% arrange(seqnames, start) %>% dplyr::select(c("gene_name", "seqnames", "start","end")) %>% rename_with(~ c("GeneSymbol", "Chr", "Start", "End")) genes_GR <- makeGRangesFromDataFrame(genes, keep.extra.columns = TRUE) save(genes_GR, genes, file = "~/Downloads/genes_GR.rda", compress = "xz")

将 CNV 区域注释到基因

# 选择需要的列 sCNV <- dplyr::select(RecCNV, c(1:4, 6)) %>% filter(`q-value` <= threshold) %>% arrange(1, 3) %>% rename_with(~c("Chr", "Aberration", "Start", "End", "q-value")) # 转换为 GenomicRanges 格式 sCNV_GR <- makeGRangesFromDataFrame(sCNV, keep.extra.columns = TRUE) # 寻找交叠区间 hits <- findOverlaps(genes_GR, sCNV_GR, type = "within") sCNV_ann <- cbind(sCNV[subjectHits(hits),], genes[queryHits(hits),]) sCNV_ann[1:3,] # Chr Aberration Start End q-value GeneSymbol Chr Start End # 1 1 0   0.0 NOL9 1   # 1.1 1 0   0.0 RNU6-731P 1   # 1.2 1 0   0.0 AL.1 1

格式化输出内容

library(glue) AmpDel_genes <- as_tibble(sCNV_ann, .name_repair = "universal") %>% mutate( AberrantRegion = glue("{Chr...1}:{Start...3}-{End...4}"), GeneRegion = glue("{Chr...7}:{Start...8}-{End...9}") ) %>% dplyr::select(c(6, 2, 5, 10, 11)) %>% mutate(Aberration = if_else( Aberration == 0, "Del", "Amp")) save(RecCNV, AmpDel_genes, file = paste0("~/Downloads/", cancer, "_CNV_results.rda"))

输出结果如下

AmpDel_genes[1:3,] # # A tibble: 3 × 5 # GeneSymbol Aberration q.value AberrantRegion GeneRegion # <chr> <chr> <dbl> <glue> <glue> # 1 NOL9 Del 0.0374 1:- 1:- # 2 RNU6-731P Del 0.0374 1:- 1:- # 3 AL.1 Del 0.0374 1:- 1:-

基因组变异可视化

OncoPrint

下面我们展示如何绘制基因组突变数据，首先，我们使用的是之前介绍过的 complexHeatmap 包的 OncoPrint 函数。

GDCquery_maf 函数用于下载突变数据

query <- GDCquery( project = "TCGA-LGG", data.category = "Simple Nucleotide Variation", access = "open", data.type = "Masked Somatic Mutation", workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking" ) GDCdownload(query) LGGmut <- GDCprepare(query) query <- GDCquery( project = "TCGA-GBM", data.category = "Simple Nucleotide Variation", access = "open", data.type = "Masked Somatic Mutation", workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking" ) GDCdownload(query) GBMmut <- GDCprepare(query) mut <- LGGmut %>% add_row(GBMmut)

并提取胶质瘤相关通路中的基因信息

# 提取胶质瘤通路中基因的突变信息 Glioma_signaling <- as_tibble(TCGAbiolinks:::listEA_pathways) %>% filter(grepl("glioma", Pathway, ignore.case = TRUE)) %>% subset(Pathway == "Glioma Signaling") # 只取 10 个基因的突变信息 Glioma_signaling_genes <- unlist(str_split(Glioma_signaling$Molecules, ","))[1:10] mut <- mut[mut$Hugo_Symbol %in% Glioma_signaling_genes,] # 获取对应的临床信息 gbm_clin <- GDCquery_clinic(project = "TCGA-GBM", type = "Clinical") lgg_clin <- GDCquery_clinic(project = "TCGA-LGG", type = "Clinical") clinical <- gbm_clin %>% add_row(lgg_clin)

绘制突变瀑布图

TCGAvisualize_oncoprint( mut = mut, gene = Glioma_signaling_genes, annotation = clinical[, c("bcr_patient_barcode", "primary_diagnosis", "vital_status", "ethnicity")], annotation.position = "bottom", color=c("background"="#CCCCCC", "DEL"="#fb8072", "INS"="#80b1d3", "SNP"="#fdb462"), label.font.size = 16, rm.empty.columns = FALSE )

circos plot

基因组变异信息也可以使用 circos 图形来展示，我们使用前面介绍的 circlize 包来展示 CNV 和突变信息

CNV 的数据是 GAIA 分析的结果，突变数据使用的是从 GDC 数据库中获取的 LGG 样本的 MAF 文件的结果

首先，处理一下突变数据

# 提取需要的突变类型 mut_type <- c( "Missense_Mutation", "Nonsense_Mutation", "Nonstop_Mutation", "Frame_Shift_Del", "Frame_Shift_Ins" ) mut <- filter(LGGmut, Variant_Classification %in% mut_type) %>% mutate( mut.id = glue("{Chromosome}:{Start_Position}-{End_Position}|{Reference_Allele}"), .before = Hugo_Symbol ) %>% # 只考虑至少在两个样本中发生突变的基因 filter(duplicated(mut.id)) %>% dplyr::select(c("Chromosome","Start_Position","End_Position","Variant_Classification","Hugo_Symbol")) %>% distinct_all() %>% { # 将 typeNames 作为全局变量，后面还要使用 typeNames <<- unique(.$Variant_Classification) type = structure(1:length(typeNames), names = typeNames) mutate(., type = type[Variant_Classification]) } %>% dplyr::select(c(1:3,6,4,5))

CNV 数据处理

load("~/Downloads/GBM_CNV_results.rda") cnv <- as_tibble(RecCNV) %>% # 值考虑小于阈值的区域，threshold = 0.3 filter(`q-value` <= threshold) %>% select(c(1, 3, 4, 2)) %>% # 将染色体名称转换为 chr 开头的字符串 mutate( Chromosome = ifelse( Chromosome == 23, "X", ifelse(Chromosome == 24, "Y", Chromosome) ), Chromosome = paste0("chr", Chromosome) ) %>% mutate(CNV = 1) %>% `names<-`(c("Chromosome","Start_position","End_position","Aberration_Kind","CNV"))

绘制 circos 图

library(circlize) par(mar=c(1,1,1,1), cex=1) circos.initializeWithIdeogram() # 添加 CNV 结果 colors <- c("forestgreen","firebrick") circos.genomicTrackPlotRegion( cnv, ylim = c(0, 1.2), panel.fun = function(region, value, ...) { circos.genomicRect( region, value, ytop.column = 2, ybottom = 0, col = colors[value[[1]] + 1], border = "white" ) cell.xlim = get.cell.meta.data("cell.xlim") circos.lines(cell.xlim, c(0, 0), lty = 2, col = "#00000040") } ) # 添加突变结果 colors <- structure(c("blue","green","red", "gold"), names = typeNames[1:4]) circos.genomicTrackPlotRegion( mut, ylim = c(1.2, 4.2), panel.fun = function(region, value, ...) { circos.genomicPoints( region, value, cex = 0.8, pch = 16, col = colors[value[[2]]], ...) } ) circos.clear() # 添加图例 legend( -0.1, 0.2, bty = "n", y.intersp = 1, c("Amp", "Del"), pch = 15, col = c("firebrick", "forestgreen"), title = "CNVs", text.font = 1, cex = 0.8, title.adj = 0 ) legend( 1, 0.2, bty = "n", y.intersp = 1, names(colors), pch = 20, col = colors, title = "Mutations", text.font = 1, cex = 0.8, title.adj = 0 )

部分区域可视化

我们可以将基因组范围限制在某一条染色体上，来更好地查看该染色体上突变和拷贝数的变异。

可以使用如下图形来展示

par(mar=c(.1, .1, .1, .1),cex=1.5) circos.par( "start.degree" = 90, canvas.xlim = c(0, 1), canvas.ylim = c(0, 1), gap.degree = 270, cell.padding = c(0, 0, 0, 0), track.margin = c(0.005, 0.005) ) circos.initializeWithIdeogram(chromosome.index = "chr17") circos.par(cell.padding = c(0, 0, 0, 0)) # 添加 CNV 结果 colors <- c("forestgreen","firebrick") names(colors) <- c(0,1) circos.genomicTrackPlotRegion( cnv, ylim = c(0, 1.2), panel.fun = function(region, value, ...) { circos.genomicRect( region, value, ytop.column = 2, ybottom = 0, col = colors[value[[1]]], border = "white" ) cell.xlim = get.cell.meta.data("cell.xlim") circos.lines(cell.xlim, c(0, 0), lty = 2, col = "#00000040") } ) # 添加单基因突变结果 gene.mut <- mutate(mut, genes.mut = glue("{Hugo_Symbol}-{type}")) %>% filter(!duplicated(genes.mut)) %>% { n.mut <- table(.$genes.mut) mutate(., num = n.mut[.$genes.mut]) } %>% mutate( genes.num = as.character(glue("{Hugo_Symbol}({num})")), type = type / 2 ) %>% dplyr::select(-(6:8)) colors <- c("blue","green","red","gold") names(colors) <- typeNames[1:4] circos.genomicTrackPlotRegion( gene.mut, ylim = c(0.3, 2.2), track.height = 0.05, panel.fun = function(region, value, ...) { circos.genomicPoints( region, value, cex = 0.4, pch = 16, col = colors[value[[2]]], ... ) } ) circos.genomicTrackPlotRegion( gene.mut, ylim = c(0, 1), track.height = 0.1, bg.border = NA ) i_track = get.cell.meta.data("track.index") circos.genomicTrackPlotRegion( gene.mut, ylim = c(0, 1), panel.fun = function(region, value, ...) { circos.genomicText( region, value, y = 1, labels.column = 3, col = colors[value[[2]]], facing = "clockwise", adj = c(1, 0.5), posTransform = posTransform.text, cex = 0.4, niceFacing = TRUE ) }, track.height = 0.1, bg.border = NA ) circos.genomicPosTransformLines( gene.mut, posTransform = function(region, value) posTransform.text( region, y = 1, labels = value[[3]], cex = 0.4, track.index = i_track + 1 ), direction = "inside", track.index = i_track ) circos.clear() legend( 0, 0.38, bty = "n", y.intersp = 1, c("Amp", "Del"), pch = 15, col = c("firebrick", "forestgreen"), title = "CNVs", text.font = 1, cex = 0.7, title.adj = 0 ) legend( 0, 0.2, bty = "n", y.intersp = 1, names(colors), pch = 16, col = colors, title = "Mutations", text.font = 1, cex = 0.7, title.adj = 0 )

完整代码：https://github.com/dxsbiocc/learn/blob/main/R/TCGA/Mutation_CNV_visualize.R

今天的文章 TCGA 数据分析实战 —— 突变及拷贝数分析分享到此就结束了，感谢您的阅读。

TCGA 数据分析实战 —— 突变及拷贝数分析