The following Python script downloads the ogbl-biokg
(i.e., BioKG) knowledge graph from the Open Graph
Benchmark (OGB).
The ogbl-biokg
dataset is described by OGB as
follows:
“The
ogbl-biokg
dataset is a Knowledge Graph (KG), which we created using data from a large number of biomedical data repositories. It contains 5 types of entities: diseases (10,687 nodes), proteins (17,499), drugs (10,533 nodes), side effects (9,969 nodes), and protein functions (45,085 nodes). There are 51 types of directed relations connecting two types of entities, including 38 kinds of drug-drug interactions, 8 kinds of protein-protein interaction, as well as drug-protein, drug-side effect, function-function relations. All relations are modeled as directed edges, among which the relations connecting the same entity types (e.g., protein-protein, drug-drug, function-function) are always symmetric, i.e., the edges are bi-directional. This dataset is relevant to both biomedical and fundamental ML research. On the biomedical side, the dataset allows us to get better insights into human biology and generate predictions that can guide downstream biomedical research. On the fundamental ML side, the dataset presents challenges in handling a noisy, incomplete KG with possible contradictory observations. This is because the
ogbl-biokg
dataset involves heterogeneous interactions that span from the molecular scale (e.g., protein-protein interactions within a cell) to whole populations (e.g., reports of unwanted side effects experienced by patients in a particular country). Further, triplets in the KG come from sources with a variety of confidence levels, including experimental readouts, human-curated annotations, and automatically extracted metadata.”
import numpy as np
import copy
import json
from ogb.linkproppred import LinkPropPredDataset
# Download/load the ogbl-biokg dataset and extract the edge splits.
# (The original lines were garbled by extraction: assignment targets had been
# displaced onto the following lines; reconstructed here.)
dataset = LinkPropPredDataset(name = "ogbl-biokg")
split_edge = dataset.get_edge_split()
train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], split_edge["test"]
graph = dataset[0] # graph: library-agnostic graph object
The edge index in the graph
object is converted into
JSON format to be read into R.
# Inspect the available graph fields, then copy the edge index dictionary
# so the key renaming below does not mutate the original graph object.
print(graph.keys())
edge_index = graph["edge_index_dict"].copy()
To make the conversion, the dictionary keys must be renamed (i.e., cannot be tuples).
# JSON keys must be strings, not tuples, so join each key tuple with "--".
# Snapshot the keys first: the dict is mutated while iterating.
old_keys = list(edge_index.keys())
for old_name in old_keys:
    new_name = "--".join(old_name)
    edge_index[new_name] = edge_index[old_name]
    del edge_index[old_name]
A special numpy encoder is defined, adapted from a widely cited StackOverflow answer on serializing NumPy types to JSON.
class NumpyEncoder(json.JSONEncoder):
    """Special json encoder for numpy types.

    json.dumps() cannot serialize NumPy scalars or arrays natively; this
    encoder converts integer scalars to int, floating scalars to float,
    and arrays to nested lists, delegating everything else to the base
    encoder (which raises TypeError for unsupported types).
    """

    def default(self, obj):
        # np.integer / np.floating are the abstract bases of all sized
        # variants (int8..int64, uint8..uint64, float16..float64), and —
        # unlike the alias np.float_, removed in NumPy 2.0 — remain valid
        # across NumPy versions.
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
Finally, we convert the edge index to JSON and write to a file.
# Serialize the renamed edge index to JSON and write it out for R.
edge_index_json = json.dumps(edge_index, cls = NumpyEncoder)
# NOTE(review): mode 'a' appends — repeated runs accumulate multiple JSON
# documents in this file; confirm append (vs overwrite 'w') is intended.
with open('inst/extdata/edge_index.json', 'a') as f:
    f.write(edge_index_json + '\n')
Read the JSON dataset saved previously.
# load libraries
library(data.table)
library(purrr)
library(magrittr)
library(rjson)
# load metapaths library
library(metapaths)
# read data
# NOTE(review): path is relative to the package/project root — the getwd()
# print below is a sanity check that the working directory is correct.
print(getwd())
biokg = fromJSON(file = "inst/extdata/edge_index.json")
Read mappings downloaded from OGB.
# directory holding the OGB-provided index-to-name mapping CSVs
mapping_dir = "inst/extdata/biokg_mappings"

# Read one mapping file and tag every row with its entity type.
# The type is taken as the first "_"-separated token of the file name
# (e.g. "disease_entidx2name..." -> "disease").
# `dir` defaults to the script-level mapping_dir but may be overridden,
# so the function no longer depends silently on a global.
read_mapping = function(mapping_path, dir = mapping_dir) {
  mapping = fread(file.path(dir, mapping_path))
  # file name prefix encodes the entity type
  mapping[, Type := strsplit(mapping_path, "_")[[1]][1]]
  return(mapping)
}
# read every "entidx2name" mapping file and stack them into one table
mappings = list.files(mapping_dir) %>%
  .[grepl("entidx2name", .)] %>%
  map_dfr(read_mapping)
# tidy: rename the raw columns, move Type first, and build a unique
# "type_index" label (same scheme used to prefix node IDs below)
mappings = mappings %>% copy() %>%
  setnames(c("ent idx", "ent name"), c("Index", "Name")) %>%
  setcolorder("Type") %>%
  .[, Label := paste(Type, Index, sep = "_")]
Create a function to convert the edge list to a
data.table
. Note that the node IDs are specific to each
type, so we must add a type-specific prefix.
# Convert one edge array of the biokg list into a data.table edge list.
# `sub_label` is the "origintype--edgetype--destinationtype" key produced by
# the Python export; `sub_kg` holds the origin indices in its first element
# and destination indices in its second (presumably the two rows of the OGB
# edge index — TODO confirm against the export).
# Node indices are only unique within a type, so a type prefix is added.
convert_biokg = function(sub_kg, sub_label) {
  # split "origin--edge--destination" label into its three parts
  split_label = strsplit(sub_label, "--")[[1]]
  origin_label = paste(split_label[1], sub_kg[[1]], sep = "_")
  destination_label = paste(split_label[3], sub_kg[[2]], sep = "_")
  # create data table
  kg_dt = data.table(
    Origin = origin_label,
    Destination = destination_label,
    OriginType = split_label[1],
    DestinationType = split_label[3],
    EdgeType = split_label[2])
  # return explicitly: the original relied on the invisible value of the
  # assignment above being captured by imap_dfr
  return(kg_dt)
}
Now, map the conversion function over the biokg
list.
# map the conversion over every relation type and row-bind into one edge list
# (imap_dfr passes each list element plus its "--"-joined name)
biokg_edge_list = imap_dfr(biokg, convert_biokg)
# derive the node list from the edge list (get_node_list() is from metapaths)
biokg_node_list = get_node_list(biokg_edge_list)
head(biokg_edge_list)
Check that the counts of each node type conform with
graph["num_nodes_dict"]
from the Python script.
disease | drug | function | protein | sideeffect |
---|---|---|---|---|
10687 | 10533 | 45085 | 17499 | 9969 |
# count nodes per type; should match graph["num_nodes_dict"] from the Python step
biokg_node_list[, .N, by = NodeType]
Add node names from mappings
table.
# add node names
# left-join human-readable names onto the node list by the "type_index" label;
# sort = FALSE keeps the original node order (spell out FALSE — T/F are
# reassignable variables, not reserved words)
biokg_node_list = merge(biokg_node_list, mappings[, .(Label, Name)], by.x = "Node", by.y = "Label", sort = FALSE) %>%
  setnames("Name", "NodeName")
Sample the knowledge graph to generate a small, connected test set.
# load igraph library
library(igraph)
# generate graph: graph_from_data_frame() / induced_subgraph() are the
# non-deprecated spellings of graph.data.frame() / induced.subgraph()
biokg_graph = igraph::graph_from_data_frame(biokg_edge_list, vertices = biokg_node_list, directed = TRUE)
# sample random nodes
set.seed(42)
seed_nodes = sample(biokg_node_list$Node, 5000)
# generate induced subgraph over the sampled nodes
biokg_sample = igraph::induced_subgraph(graph = biokg_graph, vids = seed_nodes)
# get largest connected component
comp = igraph::components(biokg_sample)
comp_idx = which.max(comp$csize)
# node names belonging to the largest component
lcc = comp$membership %>%
  {names(.)[. == comp_idx]}
# generate connected subgraph restricted to that component
biokg_sub = igraph::induced_subgraph(graph = biokg_graph, vids = lcc)
# subsampled node and edge lists
sub_edge_list = biokg_edge_list[Origin %in% lcc & Destination %in% lcc]
sub_node_list = biokg_node_list[Node %in% lcc]
# convenience function
# Return the human-readable name for a node label.
# (Looks up the full node list; substitute sub_node_list to restrict
# the search to the sampled subgraph.)
lookup = function(node) {
  biokg_node_list[Node == node, NodeName]
}
Save node list and edge list to file.
Check possible edge types.
Find hub genes.
# calculate degree of every node in the sampled subgraph
sub_degrees = degree(biokg_sub)
# candidate hubs: protein nodes with degree strictly between 20 and 30
hub_names = names(sub_degrees[sub_degrees > 20 & sub_degrees < 30])
hub_names = hub_names[grepl("protein", hub_names)]
candidate_nodes = sub_node_list[Node %in% hub_names, NodeName]
# print as HGNC identifiers, one per line
cat(paste("HGNC:", candidate_nodes, sep = ""), sep = "\n")
Test similarity between an Alzheimer’s disease (AD)-related drug (donepezil) and an AD-related pathway (“regulation of amyloid fibril formation”).
# time 100 repeated similarity computations for the same drug/pathway pair
total_time = 0
for (i in 1:100) {
  start_time = Sys.time()
  sim_test = get_similarity(
    "drug_762", "function_42893",
    mp = c("drug", "disease", "protein", "function"),
    node_list = biokg_node_list,
    edge_list = biokg_edge_list,
    verbose = FALSE)
  end_time = Sys.time()
  # force seconds: `end_time - start_time` yields a difftime whose unit
  # (secs vs mins) is chosen automatically, which would corrupt the sum
  time_required = as.numeric(difftime(end_time, start_time, units = "secs"))
  total_time = total_time + time_required
}
average_time = total_time/100
message("Computed in ", as.character(average_time), " seconds.")
# keep only the origin paths that end at the AD-related pathway of interest
origin_paths = sim_test$OriginPaths
sim_res = origin_paths[origin_paths[["function"]] == "function_36342", ]
Visualize identified metapaths in the KG.
# collect every node appearing on any identified path
sim_nodes = unique(unlist(sim_res))
# generate connected subgraph induced by those nodes
sim_graph = igraph::induced.subgraph(graph = biokg_graph, vids = sim_nodes)
# plot subgraph
plot(sim_graph, vertex.size = 15, edge.arrow.size = 0.25)
As a negative control, randomly sample 100 pathways (i.e., nodes of
type "function"
likely not AD-related) and compute the
similarity.
set.seed(42)
# sample 100 pathways
random_neg_pathways = biokg_node_list[NodeType == "function", ] %>%
  .[sample(seq_len(nrow(.)), 100), ]
neg_tests = list()
neg_sims = data.table()
# for each pathway, compute similarity and record timing + results
total_time = 0
for (i in 1:100) {
  message("Pathway #", i, ": ", random_neg_pathways[i, NodeName])
  start_time = Sys.time()
  sim_neg_test = get_similarity(
    "drug_762", random_neg_pathways[i, Node], # "function_29825",
    mp = c("drug", "disease", "protein", "function"),
    metric = c("pc", "npc", "dwpc"),
    node_list = biokg_node_list,
    edge_list = biokg_edge_list,
    verbose = FALSE)
  end_time = Sys.time()
  # force seconds: difftime's auto-chosen unit would corrupt the running sum
  time_required = as.numeric(difftime(end_time, start_time, units = "secs"))
  total_time = total_time + time_required
  # wrap in list() so each result is appended as ONE element;
  # append(neg_tests, sim_neg_test) would splice the result's components
  neg_tests = append(neg_tests, list(sim_neg_test))
  neg_sims = rbind(neg_sims, sim_neg_test$Similarity[, Pathway := random_neg_pathways[i, NodeName]])
}
average_time = total_time/100
message("Computed in ", as.character(average_time), " seconds.")
# calculate statistics (mean, median, SD, etc.)
neg_sims[, sum(Similarity == 0), by = "Metric"]
neg_sims[, mean(Similarity), by = "Metric"]
neg_sims[, median(Similarity), by = "Metric"]
neg_sims[, sd(Similarity), by = "Metric"]
# saveRDS(neg_tests, "<insert file path>.RDS")
# fwrite(neg_sims, "<insert file path>.csv")
Again, test set-to-set similarity by computing similarity scores between three AD-related drugs (i.e., donepezil, memantine, and galantamine) and a set of six AD-related pathways.
# set of three AD drugs (donepezil, memantine, galantamine per the text above)
alzheimer_drugs = c("drug_762", "drug_1075", "drug_895")
# set of six AD-related pathways
alzheimer_pathways = c("function_36342", "function_26258", "function_21333", "function_42901", "function_17601", "function_36167")
# compare sets with minimum and maximum aggregation
# (compare_sets() is from the metapaths package)
alzheimer_test = compare_sets(
  alzheimer_drugs,
  alzheimer_pathways,
  mp = c("drug", "disease", "protein", "function"),
  method = c("minimum", "maximum"),
  node_list = biokg_node_list,
  edge_list = biokg_edge_list)
As a negative control, test set-to-set similarity by computing similarity scores between the same three AD-related drugs and a set of six randomly subset pathways.
set.seed(1234) # selecting different seed
# randomly subset pathways
random_pathways = biokg_node_list[NodeType == "function", ] %>%
  .[sample(1:nrow(.), 6), ]
# compare sets
# NOTE(review): the result name `random_test` is later reassigned as a
# FUNCTION of the same name in the timing experiments, clobbering this
# result — consider renaming one of the two.
random_test = compare_sets(
  alzheimer_drugs,
  random_pathways$Node,
  mp = c("drug", "disease", "protein", "function"),
  method = c("minimum", "maximum"),
  node_list = biokg_node_list,
  edge_list = biokg_edge_list)
Example utility function to generate random meta paths of specified lengths; variants are used below.
set.seed(42)
# generate random metapath of specified length
# Build a random meta path of `mp_length` node types starting at
# `origin_type`, following OriginType -> DestinationType pairs in the
# `edge_types` table. A walk that dead-ends restarts from the origin.
random_mp = function(origin_type, mp_length, edge_types, verbose = TRUE) {
  # guard: if the origin itself has no outgoing edge types, every restart
  # would be identical and the original retry recursed forever
  if (nrow(edge_types[OriginType == origin_type]) == 0) {
    stop("origin_type '", origin_type, "' has no outgoing edge types", call. = FALSE)
  }
  # construct random meta path starting at origin
  gen_mp = c(origin_type)
  if(verbose) { cat(origin_type) }
  # seq_len() handles mp_length == 1 safely (empty loop), unlike 1:(n-1)
  for (k in seq_len(mp_length - 1)) {
    possible_types = edge_types[OriginType == tail(gen_mp, 1), unique(DestinationType)]
    if(length(possible_types) == 0) {
      # dead end mid-walk: restart from the origin
      return(random_mp(origin_type, mp_length, edge_types, verbose))
    }
    next_type = sample(possible_types, 1)
    if(verbose) { cat(" >", next_type) }
    gen_mp = append(gen_mp, next_type)
  }
  if(verbose) { cat("\n") }
  # return meta path
  return(gen_mp)
}
Randomly subsample BioKG by starting at a randomly sampled node and adding all neighbors (irrespective of edge direction) until the desired graph size is reached.
# generate connected BioKG subgraph of specific size
# Grows a subgraph from one random seed node (optionally restricted to
# `origin_types`) by repeatedly adding all neighbors — irrespective of edge
# direction — of the next frontier node, until `sample_size` nodes are found
# or the connected component is exhausted.
subsample_biokg = function(sample_size, origin_types = NULL, verbose = FALSE) {
  # sample seed node
  if(!is.null(origin_types)) {
    subgraph_nodes = sample(biokg_node_list[NodeType %in% origin_types, Node], 1)
  } else {
    subgraph_nodes = sample(biokg_node_list[, Node], 1)
  }
  subgraph_size = 1
  if(verbose) { message("Subgraph Size: ", subgraph_size) }
  sub_index = 1
  # augment subgraph until subgraph is as large as desired
  while (subgraph_size < sample_size) {
    # stop if every discovered node has been expanded: the component is
    # smaller than sample_size (the original looped forever here, indexing
    # past the end of subgraph_nodes)
    if (sub_index > length(subgraph_nodes)) {
      warning("Component exhausted at ", subgraph_size,
              " nodes (requested ", sample_size, ").", call. = FALSE)
      break
    }
    # neighbors of the sub_index-th node, irrespective of edge direction
    new_neighbors = biokg_edge_list %>%
      .[Origin == subgraph_nodes[sub_index] | Destination == subgraph_nodes[sub_index]] %>%
      .[, c(Origin, Destination)]
    # append nodes to subgraph, deduplicated
    subgraph_nodes = c(subgraph_nodes, new_neighbors) %>% unique()
    subgraph_size = length(subgraph_nodes)
    if(verbose) { message("Subgraph Size: ", subgraph_size) }
    # increment index
    sub_index = sub_index + 1
  }
  # trim to requested size; min() avoids NA padding after an early break
  subgraph_nodes = subgraph_nodes[seq_len(min(sample_size, length(subgraph_nodes)))]
  subgraph_size = length(subgraph_nodes)
  if(verbose) { message("Final Subgraph Size: ", subgraph_size) }
  # generate node and edge lists of subgraph
  sub_edge_list = biokg_edge_list[Origin %in% subgraph_nodes & Destination %in% subgraph_nodes]
  sub_node_list = biokg_node_list[Node %in% subgraph_nodes]
  # return node and edge list
  return(list(Nodes = sub_node_list, Edges = sub_edge_list))
}
Write function to evaluate how execution time of the
get_similarity()
function scales with graph size. Generate
n_graphs
number of random subgraphs of size
graph_size
; for each subgraph, run n_trials
trials (where, in each trial, a random metapath of length
mp_length
is evaluated).
# run experiments on graph of given size
# For each of `n_graphs` random subgraphs with `graph_size` nodes, run
# `n_trials` timed get_similarity() calls, each along a random meta path of
# length `mp_length`. Returns the overall average time, the per-graph
# averages, and the per-trial times.
test_size = function(graph_size, mp_length = 3, n_graphs = 10, n_trials = 10, origin_types = NULL, verbose = F) {
  # test with the parameters:
  # graph_size = 500
  # mp_length = 3
  # n_graphs = 10
  # n_trials = 10
  # define overall averages
  graph_total_time = 0
  graph_average_times = rep(NA, times = n_graphs)
  graph_trial_times = list()
  # time trial
  for (graph_number in 1:n_graphs) {
    # print graph number
    if(verbose) { message("\nGraph Number: ", graph_number) }
    # subsample graph
    random_graph = subsample_biokg(graph_size, origin_types = origin_types,
                                   verbose = verbose)
    sample_node_list = random_graph$Nodes
    sample_edge_list = random_graph$Edges
    # generate possible edge types present in this subgraph
    sample_edge_types = sample_edge_list[, .(OriginType, DestinationType)] %>% unique()
    # remove terminal types (destinations that never act as an origin)
    sample_edge_types = sample_edge_types[DestinationType %in% OriginType]
    # NOTE(review): this reassigns the `origin_types` argument inside the
    # graph loop — after the first iteration the NULL branch can never be
    # taken again, and later graphs are filtered against the types of
    # earlier graphs. Confirm this carry-over is intended.
    if(is.null(origin_types)) {
      origin_types = sample_edge_types$OriginType
    } else {
      origin_types = origin_types[origin_types %in% sample_edge_types$OriginType]
    }
    if(verbose) {
      message("-- Origin Types: ", origin_types)
      # print(sample_edge_types)
    }
    # total time variable
    total_time = 0
    trial_times = rep(NA, times = n_trials)
    # time trial
    for (trial_number in 1:n_trials) {
      # sample random origin node
      random_origin = sample_edge_list[OriginType %in% origin_types, Origin] %>% unique() %>% sample(1)
      random_origin_type = sample_node_list[Node == random_origin, NodeType]
      # print trial number
      if(verbose) { message("-- Trial Number: ", trial_number, " (", random_origin_type, ")") }
      # construct random meta path starting at origin
      test_mp = random_mp(random_origin_type, mp_length, sample_edge_types, verbose = verbose)
      # sample random destination node of the meta path's terminal type
      random_destination = sample_node_list %>%
        .[NodeType == tail(test_mp, 1)] %>%
        .[Node == sample(Node, 1), Node]
      # time similarity search
      start_time = Sys.time()
      sim_result = get_similarity(
        random_origin, random_destination,
        mp = test_mp,
        metric = c("dwpc"),
        node_list = sample_node_list,
        edge_list = sample_edge_list,
        verbose = F)
      end_time = Sys.time()
      # compute time required
      # NOTE(review): `end_time - start_time` is a difftime whose unit is
      # auto-chosen (secs vs mins); consider difftime(..., units = "secs")
      # so slow trials do not mix units in the sum.
      time_required = end_time - start_time
      trial_times[trial_number] = time_required
      total_time = total_time + time_required
    }
    # compute average time
    average_time = total_time / n_trials
    # append to graph list
    graph_total_time = graph_total_time + average_time
    graph_average_times[graph_number] = average_time
    graph_trial_times = append(graph_trial_times, list(trial_times))
  }
  # generate return object
  graph_average_time = graph_total_time / n_graphs
  names(graph_trial_times) = paste("Graph", 1:n_graphs)
  return(list(Average = graph_average_time,
              `Graph Averages` = graph_average_times,
              Trials = graph_trial_times))
}
Test on randomly generated graphs of various sizes.
# map over sizes
# size_list = c(128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536)
size_list = c(128, 256, 512, 1024, 2048, 4096, 8192, 16384)
# run the scaling experiment at each size (10 graphs x 10 trials per size)
time_test = map(size_list, ~test_size(.x, mp_length = 3, n_graphs = 10, n_trials = 10, origin_types = c("disease", "protein", "function"), verbose = T))
Plot execution time by graph size.
library(ggplot2)
# parse times: ten per-graph average times for each graph size
time_vals = data.table(
  Size = rep(size_list, each = 10),
  Time = unlist(map(time_test, ~.x$`Graph Averages`)))
# generate statistics (mean +/- SD error bars per size)
time_stats = time_vals[, .(Mean = mean(Time), SD = sd(Time)), by = Size] %>%
  .[, Upper := Mean + SD] %>%
  .[, Lower := Mean - SD]
# generate plot: mean runtime per size with SD error bars, log2 x-axis
p = ggplot(time_stats, aes(x = Size, y = Mean, fill = log2(Size))) +
  geom_bar(stat = "identity", color = "black") +
  geom_errorbar(data = time_stats, aes(x = Size, ymin = Lower, ymax = Upper),
                width = 0.2, inherit.aes = FALSE) +
  scale_fill_gradientn(colors = c("#FFDAB9", "#FBC4AB", "#F8AD9D", "#F4978E", "#F08080")) +
  labs(x = "Graph Size", y = "Mean Running Time (seconds)") +
  scale_x_continuous(trans = "log2", n.breaks = length(size_list)) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.06))) +
  theme_bw() +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 12),
    # lowercase "none" is the value ggplot2 recognizes for hiding the
    # legend; the original "None" is not a valid legend.position
    legend.position = "none"
  )
# save plot
ggsave(filename = "../Figures/size_runtime.pdf", p, width = 8, height = 5, units = "in")
ggsave(filename = "../Figures/size_runtime.svg", p, width = 8, height = 5, units = "in")
ggsave(filename = "../Figures/size_runtime.png", p, width = 8, height = 5, units = "in")
Test how execution time scales with meta path length.
# get all node types
node_types = unique(biokg_node_list$NodeType)
# list possible edge types from unique(biokg_edge_list$EdgeType)
# excluding "drug-sideeffect" since all sideeffect nodes are terminal
# excluding "protein-function" and "function-function" to prevent uninformative "function" x n metapaths
# NOTE(review): despite the comment above, the vector below DOES include
# "protein-function" and "function-function" — confirm whether the comment
# or the vector is out of date.
edge_types = c("disease-protein", "drug-disease", "drug-drug", "drug-protein", "protein-protein", "protein-function", "function-function")
# split "origin-destination" strings into a two-column lookup table
edge_types = data.table(Origin = map_chr(strsplit(edge_types, "-"), ~.x[1]),
                        Destination = map_chr(strsplit(edge_types, "-"), ~.x[2]))
# list origin types
origin_types = unique(edge_types$Origin)
# list destination types
destination_types = unique(edge_types$Destination)
# define random performance evaluation function
# meta path length must be at least 2
# Samples a random origin node, builds a random meta path of `mp_length`
# node types from the edge_types table, samples a matching destination, and
# times one get_similarity() call (DWPC metric). Returns the origin,
# destination, meta path, and elapsed time in seconds.
random_test = function(mp_length) {
  # sample random origin node
  random_origin = biokg_node_list[NodeType %in% origin_types][Node == sample(Node, 1), ]
  # construct random meta path starting at origin
  # (named mp_walk so it no longer shadows the random_mp() function)
  mp_walk = c(random_origin$NodeType)
  # seq_len() is safe for mp_length == 1 (empty loop), unlike 1:(n - 1)
  for (k in seq_len(mp_length - 1)) {
    next_type = edge_types[Origin == tail(mp_walk, 1), Destination] %>% sample(1)
    mp_walk = append(mp_walk, next_type)
  }
  # sample random destination node of the terminal type
  random_destination = biokg_node_list %>%
    .[NodeType == tail(mp_walk, 1)] %>%
    .[Node == sample(Node, 1), ]
  # time similarity search
  start_time = Sys.time()
  sim_result = get_similarity(
    random_origin$Node, random_destination$Node,
    mp = mp_walk,
    metric = c("dwpc"),
    node_list = biokg_node_list,
    edge_list = biokg_edge_list,
    verbose = TRUE)
  end_time = Sys.time()
  # compute time required in seconds (avoids difftime's auto-chosen unit,
  # which made times from slow vs fast runs incomparable)
  time_required = as.numeric(difftime(end_time, start_time, units = "secs"))
  # construct return object
  out = list(
    Origin = random_origin,
    Destination = random_destination,
    MP = mp_walk,
    Time = time_required
  )
  return(out)
}
# test meta paths with lengths 2-4 (50 random trials each)
length_2 = map(1:50, ~random_test(2))
length_3 = map(1:50, ~random_test(3))
length_4 = map(1:50, ~random_test(4))
all_times = c(length_2, length_3, length_4)
# extract per-trial times for each meta path length
times_2 = map_dbl(length_2, ~.x$Time)
times_3 = map_dbl(length_3, ~.x$Time)
times_4 = map_dbl(length_4, ~.x$Time)