16 All gene line bar
16.1 dataset and function
# dataset
<- list("TransPropy" = TransPropy, "deseq2" = deseq2, "edgeR" = edgeR, "limma" = limma, "outRst" = outRst)
datasets <- names(datasets)
data_names
<- c("CST6", "C6orf132", "FABP5P7", "ARL17A", "A2ML1",
genes "AP000892.6", "CBWD3", "GABRE", "ATRIP", "ACAD11",
"FGF22", "DNASE1L2", "AGAP5", "CTD_2231E14.8", "EIF3C",
"AKAP2", "ARPC4_TTLL3", "DNLZ", "C15orf38_AP3S2", "ADAM33",
"AC159540.1", "C17orf49", "AOX1", "ATP5J2_PTCD1", "DKK1",
"FCGR3A", "CTC_231O11.1", "CD79A", "ALX1", "BATF2")
# function
<- function(data_name, data) {
process_data colnames(data) <- gsub("-", "_", colnames(data))
<- paste0(data_name, "_result_all")
result_dir if (!dir.exists(result_dir)) {
dir.create(result_dir)
}
<- list()
all_correlation_results <- list()
all_count_data <- list()
all_hallmarks_count_data <- list()
all_kegg_count_data <- list()
positive_negative_ratio_list
for (gene in genes) {
<- data.frame()
correlation <- colnames(data)
genelist <- as.numeric(data[, gene])
genedata
for (i in 1:length(genelist)) {
<- cor.test(genedata, as.numeric(data[, i]), method = "spearman")
dd 1] <- gene
correlation[i, 2] <- genelist[i]
correlation[i, 3] <- dd$estimate
correlation[i, 4] <- dd$p.value
correlation[i,
}
colnames(correlation) <- c("gene1", "gene2", "cor", "p.value")
<- na.omit(correlation)
correlation <- correlation
all_correlation_results[[gene]]
<- sum(correlation$cor > 0)
total_positive <- sum(correlation$cor < 0)
total_negative <- sum(correlation$cor > 0.5)
positive_above_0_5 <- sum(correlation$cor < -0.5)
negative_below_minus_0_5
<- total_positive / (total_positive + total_negative)
ratio_all_cor_positive <- total_negative / (total_positive + total_negative)
ratio_all_cor_negative <- positive_above_0_5 / (positive_above_0_5 + negative_below_minus_0_5)
ratio_abs_cor_positive <- negative_below_minus_0_5 / (positive_above_0_5 + negative_below_minus_0_5)
ratio_abs_cor_negative
<- tribble(
count_data ~Methods, ~PositiveRatio, ~NegativeRatio,
"AllCor", total_positive, total_negative,
"Abs(Cor)>0.5", positive_above_0_5, negative_below_minus_0_5,
"ratio_AllCor", ratio_all_cor_positive, ratio_all_cor_negative,
"ratio_Abs(Cor)>0.5", ratio_abs_cor_positive, ratio_abs_cor_negative
)
<- count_data
all_count_data[[gene]]
<- correlation$cor
geneList names(geneList) <- correlation$gene2
<- sort(geneList, decreasing = TRUE)
geneList
for (repeat_num in 1:20) {
<- read.gmt("h.all.v7.4.symbols.gmt")
hallmarks <- GSEA(geneList, TERM2GENE = hallmarks, pvalueCutoff = 0.05)
hallmarks_y <- hallmarks_y@result %>% arrange(desc(NES))
sorted_df $core_gene_count <- sapply(strsplit(as.character(sorted_df$core_enrichment), "/"), length)
sorted_df
<- function(s) {
process_string <- sub("^HALLMARK_", "", s)
s <- unlist(strsplit(s, "_"))
words <- tolower(words)
words <- paste(toupper(substring(words, 1, 1)), substring(words, 2), sep = "")
words <- paste(words, collapse = " ")
result return(result)
}
$ID <- sapply(sorted_df$ID, process_string)
sorted_df$Description <- sapply(sorted_df$Description, process_string)
sorted_dfrownames(sorted_df) <- sapply(rownames(sorted_df), process_string)
@result <- sorted_df
hallmarks_y<- names(hallmarks_y@geneSets)
gene_set_names <- sapply(gene_set_names, process_string)
new_gene_set_names names(hallmarks_y@geneSets) <- new_gene_set_names
saveRDS(hallmarks_y, paste0(result_dir, "/", data_name, "_", gene, "_hallmarks_y_", repeat_num, ".rds"))
<- hallmarks_y@result[ , "NES"]
NES_values <- sum(NES_values > 0)
positive_count <- sum(NES_values < 0)
negative_count
<- tribble(
hallmarks_count_data ~Methods, ~PositiveRatio, ~NegativeRatio,
"hallmarks_NES", positive_count, negative_count
)
paste0(gene, "_hallmarks_", repeat_num)]] <- hallmarks_count_data
all_hallmarks_count_data[[
<- positive_count / (negative_count + positive_count)
positive_negative_ratio paste0(gene, "_hallmarks_", repeat_num)]] <- positive_negative_ratio
positive_negative_ratio_list[[
<- read.gmt("c2.cp.kegg.v7.4.symbols.gmt")
hallmarks <- GSEA(geneList, TERM2GENE = hallmarks)
kegg_y <- kegg_y@result %>% arrange(desc(NES))
sorted_df $core_gene_count <- sapply(strsplit(as.character(sorted_df$core_enrichment), "/"), length)
sorted_df
$ID <- sapply(sorted_df$ID, process_string)
sorted_df$Description <- sapply(sorted_df$Description, process_string)
sorted_dfrownames(sorted_df) <- sapply(rownames(sorted_df), process_string)
@result <- sorted_df
kegg_y<- names(kegg_y@geneSets)
gene_set_names <- sapply(gene_set_names, process_string)
new_gene_set_names names(kegg_y@geneSets) <- new_gene_set_names
saveRDS(kegg_y, paste0(result_dir, "/", data_name, "_", gene, "_kegg_y_", repeat_num, ".rds"))
<- kegg_y@result[ , "NES"]
NES_values <- sum(NES_values > 0)
positive_count <- sum(NES_values < 0)
negative_count
<- tribble(
kegg_count_data ~Methods, ~PositiveRatio, ~NegativeRatio,
"kegg_NES", positive_count, negative_count
)
paste0(gene, "_kegg_", repeat_num)]] <- kegg_count_data
all_kegg_count_data[[<- positive_count / (negative_count + positive_count)
positive_negative_ratio paste0(gene, "_kegg_", repeat_num)]] <- positive_negative_ratio
positive_negative_ratio_list[[
}
}
saveRDS(all_correlation_results, paste0(result_dir, "/", data_name, "_all_correlation_results.rds"))
saveRDS(all_count_data, paste0(result_dir, "/", data_name, "_all_count_data.rds"))
saveRDS(all_hallmarks_count_data, paste0(result_dir, "/", data_name, "_all_hallmarks_count_data.rds"))
saveRDS(all_kegg_count_data, paste0(result_dir, "/", data_name, "_all_kegg_count_data.rds"))
saveRDS(positive_negative_ratio_list, paste0(result_dir, "/", data_name, "_positive_negative_ratio_list.rds"))
}
# run function
for (data_name in data_names) {
process_data(data_name, datasets[[data_name]])
}
16.2 load data
<- readRDS(paste0('deseq2_result_all', "/", "deseq2", "_positive_negative_ratio_list.rds"))
deseq2_positive_negative_ratio_list <- readRDS(paste0('TransPropy_result_all', "/", "TransPropy", "_positive_negative_ratio_list.rds"))
TransPropy2_positive_negative_ratio_list <- readRDS(paste0('edgeR_result_all', "/", "edgeR", "_positive_negative_ratio_list.rds"))
edgeR_positive_negative_ratio_list <- readRDS(paste0('limma_result_all', "/", "limma", "_positive_negative_ratio_list.rds"))
limma_positive_negative_ratio_list <- readRDS(paste0('outRst_result_all', "/", "outRst", "_positive_negative_ratio_list.rds")) outRst_positive_negative_ratio_list
16.3 TransPropy
# Convert list to dataframe
<- data.frame(
TransPropy2_positive_negative_ratio_list_data Gene_Type = names(TransPropy2_positive_negative_ratio_list),
Value = unlist(TransPropy2_positive_negative_ratio_list)
)
# Process data using the mutate function
<- TransPropy2_positive_negative_ratio_list_data %>%
TransPropy2_positive_negative_ratio_list_data mutate(Gene = sub("_.*", "", Gene_Type),
Type = sub(".*_(.*)_.*", "\\1", Gene_Type),
Index = as.numeric(sub(".*_", "", Gene_Type)))
# View the organized data
head(TransPropy2_positive_negative_ratio_list_data)
# Plotting
ggplot() +
# Plot line chart, ignoring NaN values
geom_line(data = TransPropy2_positive_negative_ratio_list_data %>% filter(!is.na(Value)),
aes(x = Index, y = Value, color = Type, group = interaction(Gene, Type))) +
geom_point(data = TransPropy2_positive_negative_ratio_list_data %>% filter(!is.na(Value)),
aes(x = Index, y = Value, color = Type, group = interaction(Gene, Type))) +
# Plot a bar chart for NaN values
geom_bar(data = TransPropy2_positive_negative_ratio_list_data %>% filter(is.na(Value)),
aes(x = Index, y = 1, fill = Type), stat = "identity", alpha = 0.5, width = 0.5) +
scale_fill_manual(values = c("hallmarks" = "#F8766D", "kegg" = "#00BFC4")) + # Manually set fill colors
facet_wrap(~ Gene, scales = "free_x") +
labs(title = "Gene Value Distribution with NaN Handling",
x = "Index",
y = "Value") +
ylim(0, 1) + # Set Y-axis range from 0 to 1
theme_minimal() +
theme(legend.position = "bottom")
Transpropy
16.4 deseq2
<- data.frame(
deseq2_positive_negative_ratio_list_data Gene_Type = names(deseq2_positive_negative_ratio_list),
Value = unlist(deseq2_positive_negative_ratio_list)
)
<- deseq2_positive_negative_ratio_list_data %>%
deseq2_positive_negative_ratio_list_data mutate(Gene = sub("_.*", "", Gene_Type),
Type = sub(".*_(.*)_.*", "\\1", Gene_Type),
Index = as.numeric(sub(".*_", "", Gene_Type)))
head(deseq2_positive_negative_ratio_list_data)
ggplot() +
geom_line(data = deseq2_positive_negative_ratio_list_data %>% filter(!is.na(Value)),
aes(x = Index, y = Value, color = Type, group = interaction(Gene, Type))) +
geom_point(data = deseq2_positive_negative_ratio_list_data %>% filter(!is.na(Value)),
aes(x = Index, y = Value, color = Type, group = interaction(Gene, Type))) +
geom_bar(data = deseq2_positive_negative_ratio_list_data %>% filter(is.na(Value)),
aes(x = Index, y = 1, fill = Type), stat = "identity", alpha = 0.5, width = 0.5) +
scale_fill_manual(values = c("hallmarks" = "#F8766D", "kegg" = "#00BFC4")) +
facet_wrap(~ Gene, scales = "free_x") +
labs(title = "Gene Value Distribution with NaN Handling",
x = "Index",
y = "Value") +
theme_minimal() +
theme(legend.position = "bottom")
deseq2
16.5 outRst
<- data.frame(
outRst_positive_negative_ratio_list_data Gene_Type = names(outRst_positive_negative_ratio_list),
Value = unlist(outRst_positive_negative_ratio_list)
)
<- outRst_positive_negative_ratio_list_data %>%
outRst_positive_negative_ratio_list_data mutate(Gene = sub("_.*", "", Gene_Type),
Type = sub(".*_(.*)_.*", "\\1", Gene_Type),
Index = as.numeric(sub(".*_", "", Gene_Type)))
head(outRst_positive_negative_ratio_list_data)
ggplot() +
geom_line(data = outRst_positive_negative_ratio_list_data %>% filter(!is.na(Value)),
aes(x = Index, y = Value, color = Type, group = interaction(Gene, Type))) +
geom_point(data = outRst_positive_negative_ratio_list_data %>% filter(!is.na(Value)),
aes(x = Index, y = Value, color = Type, group = interaction(Gene, Type))) +
geom_bar(data = outRst_positive_negative_ratio_list_data %>% filter(is.na(Value)),
aes(x = Index, y = 1, fill = Type), stat = "identity", alpha = 0.5, width = 0.5) +
scale_fill_manual(values = c("hallmarks" = "#F8766D", "kegg" = "#00BFC4")) +
facet_wrap(~ Gene, scales = "free_x") +
labs(title = "Gene Value Distribution with NaN Handling",
x = "Index",
y = "Value") +
theme_minimal() +
theme(legend.position = "bottom")
WRST
16.6 limma
<- data.frame(
limma_positive_negative_ratio_list_data Gene_Type = names(limma_positive_negative_ratio_list),
Value = unlist(limma_positive_negative_ratio_list)
)
<- limma_positive_negative_ratio_list_data %>%
limma_positive_negative_ratio_list_data mutate(Gene = sub("_.*", "", Gene_Type),
Type = sub(".*_(.*)_.*", "\\1", Gene_Type),
Index = as.numeric(sub(".*_", "", Gene_Type)))
head(limma_positive_negative_ratio_list_data)
ggplot() +
geom_line(data = limma_positive_negative_ratio_list_data %>% filter(!is.na(Value)),
aes(x = Index, y = Value, color = Type, group = interaction(Gene, Type))) +
geom_point(data = limma_positive_negative_ratio_list_data %>% filter(!is.na(Value)),
aes(x = Index, y = Value, color = Type, group = interaction(Gene, Type))) +
geom_bar(data = limma_positive_negative_ratio_list_data %>% filter(is.na(Value)),
aes(x = Index, y = 1, fill = Type), stat = "identity", alpha = 0.5, width = 0.5) +
scale_fill_manual(values = c("hallmarks" = "#F8766D", "kegg" = "#00BFC4")) +
facet_wrap(~ Gene, scales = "free_x") +
labs(title = "Gene Value Distribution with NaN Handling",
x = "Index",
y = "Value") +
theme_minimal() +
theme(legend.position = "bottom")
limma
16.7 edgeR
<- data.frame(
edgeR_positive_negative_ratio_list_data Gene_Type = names(edgeR_positive_negative_ratio_list),
Value = unlist(edgeR_positive_negative_ratio_list)
)
<- edgeR_positive_negative_ratio_list_data %>%
edgeR_positive_negative_ratio_list_data mutate(Gene = sub("_.*", "", Gene_Type),
Type = sub(".*_(.*)_.*", "\\1", Gene_Type),
Index = as.numeric(sub(".*_", "", Gene_Type)))
head(edgeR_positive_negative_ratio_list_data)
ggplot() +
geom_line(data = edgeR_positive_negative_ratio_list_data %>% filter(!is.na(Value)),
aes(x = Index, y = Value, color = Type, group = interaction(Gene, Type))) +
geom_point(data = edgeR_positive_negative_ratio_list_data %>% filter(!is.na(Value)),
aes(x = Index, y = Value, color = Type, group = interaction(Gene, Type))) +
geom_bar(data = edgeR_positive_negative_ratio_list_data %>% filter(is.na(Value)),
aes(x = Index, y = 1, fill = Type), stat = "identity", alpha = 0.5, width = 0.5) +
scale_fill_manual(values = c("hallmarks" = "#F8766D", "kegg" = "#00BFC4")) +
facet_wrap(~ Gene, scales = "free_x") +
labs(title = "Gene Value Distribution with NaN Handling",
x = "Index",
y = "Value") +
theme_minimal() +
theme(legend.position = "bottom")
edgeR
16.8 hallmarks
# Add data source identifiers
<- deseq2_positive_negative_ratio_list_data %>% mutate(Source = "deseq2")
deseq2_positive_negative_ratio_list_data1 <- edgeR_positive_negative_ratio_list_data %>% mutate(Source = "edgeR")
edgeR_positive_negative_ratio_list_data1 <- TransPropy2_positive_negative_ratio_list_data %>% mutate(Source = "TransPropy2")
TransPropy2_positive_negative_ratio_list_data1 <- outRst_positive_negative_ratio_list_data %>% mutate(Source = "outRst")
outRst_positive_negative_ratio_list_data1 <- limma_positive_negative_ratio_list_data %>% mutate(Source = "limma")
limma_positive_negative_ratio_list_data1
# Merge all data frames
<- bind_rows(
all_data1
deseq2_positive_negative_ratio_list_data1,
edgeR_positive_negative_ratio_list_data1,
TransPropy2_positive_negative_ratio_list_data1,
outRst_positive_negative_ratio_list_data1,
limma_positive_negative_ratio_list_data1
)
# Convert the Gene column to a factor and specify the order of factor levels
$Gene <- factor(all_data1$Gene, levels = unique(all_data1$Gene))
all_data1
<- c("deseq2" = "#3273c1", "edgeR" = "#2b8f9a", "TransPropy2" = "#6e4ab4",
colors "outRst" = "#48884d", "limma" = "#a8a74e")
# Handle NaN values, calculate segment heights
<- all_data1 %>%
nan_data filter(is.na(Value)) %>%
group_by(Gene, Type) %>%
mutate(n_sources = n_distinct(Source),
height = 1 / n_sources) %>%
ungroup()
$Gene <- factor(nan_data$Gene, levels = unique(all_data1$Gene))
nan_data
print(nan_data)
ggplot() +
geom_line(data = all_data1 %>% filter(Type == "hallmarks", !is.na(Value)),
aes(x = Gene, y = Value, color = Source, group = interaction(Gene, Source)), size = 1) +
geom_point(data = all_data1 %>% filter(Type == "hallmarks", !is.na(Value)),
aes(x = Gene, y = Value, color = Source, group = interaction(Gene, Source)), size = 3) +
geom_smooth(data = all_data1 %>% filter(Type == "hallmarks" & !is.na(Value)),
aes(x = as.numeric(Gene), y = Value, color = Source, fill = Source), method = "lm", se = TRUE, alpha = 0.2) +
geom_segment(data = nan_data %>% filter(Type == "hallmarks"),
aes(x = Gene, xend = Gene, y = 0, yend = height, color = Source), size = 1, alpha = 0.4) +
scale_color_manual(values = colors) +
scale_fill_manual(values = colors) +
scale_x_discrete(limits = levels(all_data1$Gene)) + # Ensure the X-axis order is consistent with the factor order
labs(title = "Hallmarks Gene Value Distribution",
x = "Gene",
y = "Value") +
ylim(0, 1) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
hallmarks_all
16.9 kegg
<- deseq2_positive_negative_ratio_list_data %>% mutate(Source = "deseq2")
deseq2_positive_negative_ratio_list_data1 <- edgeR_positive_negative_ratio_list_data %>% mutate(Source = "edgeR")
edgeR_positive_negative_ratio_list_data1 <- TransPropy2_positive_negative_ratio_list_data %>% mutate(Source = "TransPropy2")
TransPropy2_positive_negative_ratio_list_data1 <- outRst_positive_negative_ratio_list_data %>% mutate(Source = "outRst")
outRst_positive_negative_ratio_list_data1 <- limma_positive_negative_ratio_list_data %>% mutate(Source = "limma")
limma_positive_negative_ratio_list_data1
<- bind_rows(
all_data1
deseq2_positive_negative_ratio_list_data1,
edgeR_positive_negative_ratio_list_data1,
TransPropy2_positive_negative_ratio_list_data1,
outRst_positive_negative_ratio_list_data1,
limma_positive_negative_ratio_list_data1
)
$Gene <- factor(all_data1$Gene, levels = unique(all_data1$Gene))
all_data1
<- c("deseq2" = "#3273c1", "edgeR" = "#2b8f9a", "TransPropy2" = "#6e4ab4",
colors "outRst" = "#48884d", "limma" = "#a8a74e")
<- all_data1 %>%
nan_data filter(is.na(Value)) %>%
group_by(Gene, Type) %>%
mutate(n_sources = n_distinct(Source),
height = 1 / n_sources) %>%
ungroup() %>%
arrange(Gene, Type, Source) %>%
group_by(Gene) %>%
mutate(cumulative_height = cumsum(ifelse(duplicated(Source), 0, height))) %>%
ungroup() %>%
mutate(y_start = cumulative_height - height,
y_end = cumulative_height)
$Gene <- factor(nan_data$Gene, levels = unique(all_data1$Gene))
nan_data
print(nan_data)
ggplot() +
geom_segment(data = nan_data %>% filter(Type == "kegg"),
aes(x = as.numeric(Gene), xend = as.numeric(Gene), y = y_start, yend = y_end, color = Source),
size = 7, alpha = 0.1) +
geom_smooth(data = all_data1 %>% filter(Type == "kegg" & !is.na(Value)),
aes(x = as.numeric(Gene), y = Value, color = Source, fill = Source), method = "lm", se = TRUE, alpha = 0.2) +
geom_line(data = all_data1 %>% filter(Type == "kegg" & !is.na(Value)),
aes(x = as.numeric(Gene), y = Value, color = Source, group = interaction(Gene, Source)),
size = 1) +
geom_point(data = all_data1 %>% filter(Type == "kegg" & !is.na(Value)),
aes(x = as.numeric(Gene), y = Value, color = Source, group = interaction(Gene, Source)),
size = 3) +
scale_color_manual(values = colors) +
scale_fill_manual(values = colors) +
scale_x_continuous(breaks = 1:length(levels(all_data1$Gene)), labels = levels(all_data1$Gene)) +
labs(title = "Kegg Gene Value Distribution",
x = "Gene",
y = "Value") +
ylim(0, 1) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
kegg_all