Download BioKG

The following Python script downloads the ogbl-biokg (i.e., BioKG) knowledge graph from the Open Graph Benchmark (OGB).

The ogbl-biokg dataset is described by OGB as follows:

“The ogbl-biokg dataset is a Knowledge Graph (KG), which we created using data from a large number of biomedical data repositories. It contains 5 types of entities: diseases (10,687 nodes), proteins (17,499), drugs (10,533 nodes), side effects (9,969 nodes), and protein functions (45,085 nodes). There are 51 types of directed relations connecting two types of entities, including 38 kinds of drug-drug interactions, 8 kinds of protein-protein interaction, as well as drug-protein, drug-side effect, function-function relations. All relations are modeled as directed edges, among which the relations connecting the same entity types (e.g., protein-protein, drug-drug, function-function) are always symmetric, i.e., the edges are bi-directional.

This dataset is relevant to both biomedical and fundamental ML research. On the biomedical side, the dataset allows us to get better insights into human biology and generate predictions that can guide downstream biomedical research. On the fundamental ML side, the dataset presents challenges in handling a noisy, incomplete KG with possible contradictory observations. This is because the ogbl-biokg dataset involves heterogeneous interactions that span from the molecular scale (e.g., protein-protein interactions within a cell) to whole populations (e.g., reports of unwanted side effects experienced by patients in a particular country). Further, triplets in the KG come from sources with a variety of confidence levels, including experimental readouts, human-curated annotations, and automatically extracted metadata.”

import numpy as np
import json
from ogb.linkproppred import LinkPropPredDataset

dataset = LinkPropPredDataset(name = "ogbl-biokg")

split_edge = dataset.get_edge_split()
train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], split_edge["test"]
graph = dataset[0] # graph: library-agnostic graph object

The edge index in the graph object is converted into JSON format to be read into R.

print(graph.keys())
edge_index = graph["edge_index_dict"].copy()

To make the conversion, the dictionary keys, which are (head type, relation, tail type) tuples, must be converted to strings, since JSON object keys cannot be tuples.

old_keys = list(edge_index.keys())
for old_name in old_keys:
    new_name = "--".join(old_name)
    edge_index[new_name] = edge_index[old_name]
    del edge_index[old_name]

A special NumPy encoder, adapted from a StackOverflow answer, is defined to serialize NumPy types.

class NumpyEncoder(json.JSONEncoder):
    """ Special json encoder for numpy types """
    def default(self, obj):
        if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
                            np.int16, np.int32, np.int64, np.uint8,
                            np.uint16, np.uint32, np.uint64)):
            return int(obj)
        elif isinstance(obj, (np.float_, np.float16, np.float32,
                              np.float64)):
            return float(obj)
        elif isinstance(obj, (np.ndarray,)):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

Finally, we convert the edge index to JSON and write to a file.

edge_index_json = json.dumps(edge_index, cls = NumpyEncoder)

# open in write mode ('w') so reruns overwrite rather than append a second JSON document
with open('inst/extdata/edge_index.json', 'w') as f:
    f.write(edge_index_json + '\n')

Parse KG in R

Read the JSON dataset saved previously.

# load libraries
library(data.table)
library(purrr)
library(magrittr)
library(rjson)

# load metapaths library
library(metapaths)

# read data
print(getwd())
biokg = fromJSON(file = "inst/extdata/edge_index.json")

Read mappings downloaded from OGB.

mapping_dir = "inst/extdata/biokg_mappings"

read_mapping = function(mapping_path) {
  mapping = fread(file.path(mapping_dir, mapping_path))
  mapping[, Type := strsplit(mapping_path, "_")[[1]][1]]
  return(mapping)
}

mappings = list.files(mapping_dir) %>%
  .[grepl("entidx2name", .)] %>%
  map_dfr(read_mapping)

mappings = mappings %>% copy() %>%
  setnames(c("ent idx", "ent name"), c("Index", "Name")) %>%
  setcolorder("Type") %>%
  .[, Label := paste(Type, Index, sep = "_")]
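For reference, a quick peek at the cleaned mapping table (a sketch; the columns should be Type, Index, Name, and Label):

# inspect the first few rows of the cleaned mapping table
head(mappings, 3)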

Create a function to convert the edge list to a data.table. Note that the node IDs are specific to each type, so we must add a type-specific prefix.

convert_biokg = function(sub_kg, sub_label) {
  
  # split label into origin type, edge type, and destination type
  split_label = strsplit(sub_label, "--")[[1]]
  origin_label = paste(split_label[1], sub_kg[[1]], sep = "_")
  destination_label = paste(split_label[3], sub_kg[[2]], sep = "_")
  
  # create data table
  kg_dt = data.table(
    Origin = origin_label,
    Destination = destination_label,
    OriginType = split_label[1],
    DestinationType = split_label[3],
    EdgeType = split_label[2])
  
  return(kg_dt)
  
}

Now, map the conversion function over the biokg list.

biokg_edge_list = imap_dfr(biokg, convert_biokg)
biokg_node_list = get_node_list(biokg_edge_list)
head(biokg_edge_list)

Check that the counts of each node type conform with graph["num_nodes_dict"] from the Python script.

biokg_node_list[, .N, by = NodeType]

Expected counts from graph["num_nodes_dict"]:

disease drug function protein sideeffect
10687 10533 45085 17499 9969
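As a sketch, the same check can be done programmatically (expected_counts and type_counts are illustrative names, not defined elsewhere):

# hypothetical check against the counts reported by OGB
expected_counts = c(disease = 10687, drug = 10533, `function` = 45085,
                    protein = 17499, sideeffect = 9969)
type_counts = biokg_node_list[, .N, by = NodeType]
all(type_counts[names(expected_counts), on = "NodeType", N] == expected_counts)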

Add node names from the mappings table.

# add node names
biokg_node_list = merge(biokg_node_list, mappings[, .(Label, Name)], by.x = "Node", by.y = "Label", sort = F) %>%
  setnames("Name", "NodeName")

Sample the knowledge graph to generate a small, connected test set.

# load igraph library
library(igraph)

# generate graph
biokg_graph = igraph::graph_from_data_frame(biokg_edge_list, vertices = biokg_node_list, directed = TRUE)

# sample random nodes
set.seed(42)
seed_nodes = sample(biokg_node_list$Node, 5000)

# generate induced subgraph
biokg_sample = igraph::induced_subgraph(graph = biokg_graph, vids = seed_nodes)

# get largest connected component
comp = igraph::components(biokg_sample) 
comp_idx = which.max(comp$csize)
lcc = comp$membership %>%
  {names(.)[. == comp_idx]}

# generate connected subgraph
biokg_sub = igraph::induced_subgraph(graph = biokg_graph, vids = lcc)

# generate subsampled node and edge lists
sub_edge_list = biokg_edge_list[Origin %in% lcc & Destination %in% lcc]
sub_node_list = biokg_node_list[Node %in% lcc]

# convenience function
lookup = function(node) {
  # return(sub_node_list[Node == node, NodeName])
  return(biokg_node_list[Node == node, NodeName])
}
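For example (assuming, as in the similarity tests below, that "drug_762" is donepezil's node ID):

# example: look up a node's display name by its type-prefixed ID
lookup("drug_762")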

Save node list and edge list to file.

save(sub_edge_list, file = "data/sub_edge_list.rda")
save(sub_node_list, file = "data/sub_node_list.rda")

Test Node Similarity

Check the node types present in the subsample.

message("- ", paste(unique(sub_node_list[, NodeType]), collapse = "\n- "))

Find candidate genes: protein nodes with degree between 20 and 30.

# calculate degree
sub_degrees = degree(biokg_sub)

# find candidate nodes
candidate_nodes = sub_degrees %>%
  .[which(. > 20 & . < 30)] %>% names() %>% .[grepl("protein", .)] %>%
  { sub_node_list[Node %in% ., NodeName] }

cat(paste("HGNC:", candidate_nodes, sep = ""), sep = "\n")

Test similarity between an Alzheimer’s disease (AD)-related drug (donepezil) and an AD-related pathway (“regulation of amyloid fibril formation”).

total_time = 0
for (i in 1:100) {
  start_time = Sys.time()
  sim_test = get_similarity(
    "drug_762", "function_42893",
    mp = c("drug", "disease", "protein", "function"),
    node_list = biokg_node_list,
    edge_list = biokg_edge_list,
    verbose = F)
  end_time = Sys.time()
  # convert the difftime to seconds so units are consistent across iterations
  time_required = as.numeric(difftime(end_time, start_time, units = "secs"))
  total_time = total_time + time_required
}
average_time = total_time / 100
message("Computed in ", average_time, " seconds.")

# restrict origin paths to those passing through a specific AD-related function node
sim_res = sim_test$OriginPaths %>%
  .[.[["function"]] == "function_36342", ]

Visualize identified metapaths in the KG.

sim_nodes = unique(unlist(sim_res))

# generate induced subgraph of the metapath nodes
sim_graph = igraph::induced_subgraph(graph = biokg_graph, vids = sim_nodes)

# plot subgraph
plot(sim_graph, vertex.size = 15, edge.arrow.size = 0.25)

As a negative control, randomly sample 100 pathways (i.e., nodes of type "function" likely not AD-related) and compute the similarity.

set.seed(42)

# sample 100 pathways
random_neg_pathways = biokg_node_list[NodeType == "function", ] %>%
    .[sample(1:nrow(.), 100), ] 
neg_tests = list()
neg_sims = data.table()

# for each pathway, compute similarity
total_time = 0
for (i in 1:100) {
  
  message("Pathway #", i, ": ", random_neg_pathways[i, NodeName])
  
  start_time = Sys.time()
  sim_neg_test = get_similarity(
    "drug_762", random_neg_pathways[i, Node], # "function_29825",
    mp = c("drug", "disease", "protein", "function"),
    metric = c("pc", "npc", "dwpc"),
    node_list = biokg_node_list,
    edge_list = biokg_edge_list,
    verbose = F)
  end_time = Sys.time()
  
  # convert the difftime to seconds for consistent units
  time_required = as.numeric(difftime(end_time, start_time, units = "secs"))
  total_time = total_time + time_required
  
  # wrap in list() so each result is appended as a single element
  neg_tests = append(neg_tests, list(sim_neg_test))
  neg_sims = rbind(neg_sims, sim_neg_test$Similarity[, Pathway := random_neg_pathways[i, NodeName]])
  
}

average_time = total_time / 100
message("Computed in ", average_time, " seconds.")

# calculate statistics (zero count, mean, median, SD) by metric
neg_sims[, .(Zeros = sum(Similarity == 0),
             Mean = mean(Similarity),
             Median = median(Similarity),
             SD = sd(Similarity)),
         by = Metric]

# saveRDS(neg_tests, "<insert file path>.RDS")
# fwrite(neg_sims, "<insert file path>.csv")

Test Set Similarity

Next, test set-to-set similarity by computing similarity scores between three AD-related drugs (i.e., donepezil, memantine, and galantamine) and a set of six AD-related pathways.

# set of three AD drugs
alzheimer_drugs = c("drug_762", "drug_1075", "drug_895")

# set of six AD-related pathways
alzheimer_pathways = c("function_36342", "function_26258", "function_21333", "function_42901", "function_17601", "function_36167")

# compare sets
alzheimer_test = compare_sets(
  alzheimer_drugs,
  alzheimer_pathways,
  mp = c("drug", "disease", "protein", "function"),
  method = c("minimum", "maximum"),
  node_list = biokg_node_list,
  edge_list = biokg_edge_list)

As a negative control, test set-to-set similarity by computing similarity scores between the same three AD-related drugs and a set of six randomly sampled pathways.

set.seed(1234) # selecting different seed

# randomly subset pathways
random_pathways = biokg_node_list[NodeType == "function", ] %>%
  .[sample(1:nrow(.), 6), ]

# compare sets
random_test = compare_sets(
  alzheimer_drugs,
  random_pathways$Node,
  mp = c("drug", "disease", "protein", "function"),
  method = c("minimum", "maximum"),
  node_list = biokg_node_list,
  edge_list = biokg_edge_list)

Package Efficiency

Define an example utility function that generates a random metapath of a specified length; variants of it are used below.

set.seed(42)

# generate random metapath of specified length
random_mp = function(origin_type, mp_length, edge_types, verbose = T) {
  
  # construct random meta path starting at origin
  gen_mp = c(origin_type)
  if(verbose) { cat(origin_type) }
  
  # construct meta path step by step
  for (k in 1:(mp_length - 1)) {
    possible_types = edge_types[OriginType == tail(gen_mp, 1), unique(DestinationType)]
    # dead end: no outgoing edge types, so restart from the origin
    if(length(possible_types) == 0) {
      return(random_mp(origin_type, mp_length, edge_types, verbose))
    }
    next_type = sample(possible_types, 1)
    if(verbose) { cat(" >", next_type) }
    gen_mp = append(gen_mp, next_type)
  }
  
  if(verbose) { cat("\n") }
  
  # return meta path
  return(gen_mp)
  
}
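As a usage sketch, the edge_types argument is expected to be a table with OriginType and DestinationType columns; the unique type pairs of the full edge list work (biokg_edge_types is an illustrative name, not defined elsewhere):

# usage sketch: derive type pairs from the full edge list
biokg_edge_types = unique(biokg_edge_list[, .(OriginType, DestinationType)])

# generate a random metapath of length 4 starting from "drug"
random_mp("drug", 4, biokg_edge_types)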

Randomly subsample BioKG by starting at a randomly sampled node and adding all neighbors (irrespective of edge direction) until the desired graph size is reached.

# generate connected BioKG subgraph of a specific size
subsample_biokg = function(sample_size, origin_types = NULL, verbose = F) {
  
  # sample seed node
  if(!is.null(origin_types)) {
    subgraph_nodes = sample(biokg_node_list[NodeType %in% origin_types, Node], 1)
  } else {
    subgraph_nodes = sample(biokg_node_list[, Node], 1)
  }
  subgraph_size = 1
  if(verbose) { message("Subgraph Size: ", subgraph_size) }
  sub_index = 1
  
  # augment subgraph until subgraph is as large as desired
  while (subgraph_size < sample_size) {
      
      # augment subgraph by appending neighbors of i-th node in subgraph
      # take neighbors irrespective of destination
      new_neighbors = biokg_edge_list %>%
        .[Origin == subgraph_nodes[sub_index] | Destination == subgraph_nodes[sub_index]] %>% .[, c(Origin, Destination)]
      
      # append nodes to subgraph
      subgraph_nodes = c(subgraph_nodes, new_neighbors) %>% unique()
      subgraph_size = length(subgraph_nodes)
      if(verbose) { message("Subgraph Size: ", subgraph_size) }
      
      # increment index
      sub_index = sub_index + 1
      
  }
  
  # subset nodes to size
  subgraph_nodes = subgraph_nodes[1:sample_size]
  subgraph_size = length(subgraph_nodes)
  if(verbose) { message("Final Subgraph Size: ", subgraph_size) }
  
  # generate node and edge lists of subgraph
  sub_edge_list = biokg_edge_list[Origin %in% subgraph_nodes & Destination %in% subgraph_nodes]
  sub_node_list = biokg_node_list[Node %in% subgraph_nodes]
  
  # return node and edge list
  return(list(Nodes = sub_node_list, Edges = sub_edge_list))
  
}
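For instance, a small subgraph seeded at a random drug node can be drawn as follows (a sketch; small_kg is an illustrative name):

# example: sample a 500-node subgraph seeded at a random drug node
small_kg = subsample_biokg(500, origin_types = "drug")

# inspect the node-type composition of the sample
small_kg$Nodes[, .N, by = NodeType]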

Write a function to evaluate how the execution time of get_similarity() scales with graph size: generate n_graphs random subgraphs of size graph_size and, for each subgraph, run n_trials trials, where each trial evaluates a random metapath of length mp_length.

# run experiments on graph of given size
test_size = function(graph_size, mp_length = 3, n_graphs = 10, n_trials = 10, origin_types = NULL, verbose = F) {
  
  # test with the parameters:
  # graph_size = 500
  # mp_length = 3
  # n_graphs = 10
  # n_trials = 10
  
  # define overall averages
  graph_total_time = 0
  graph_average_times = rep(NA, times = n_graphs)
  graph_trial_times = list()
  
  # time trial
  for (graph_number in 1:n_graphs) {
    
    # print graph number
    if(verbose) { message("\nGraph Number: ", graph_number) }
    
    # subsample graph
    random_graph = subsample_biokg(graph_size, origin_types = origin_types, 
                                   verbose = verbose)
    sample_node_list = random_graph$Nodes
    sample_edge_list = random_graph$Edges
    
    # generate possible edge types
    sample_edge_types = sample_edge_list[, .(OriginType, DestinationType)] %>% unique()
    
    # remove terminal types
    sample_edge_types = sample_edge_types[DestinationType %in% OriginType]
    if(is.null(origin_types)) {
      origin_types = sample_edge_types$OriginType
    } else {
      origin_types = origin_types[origin_types %in% sample_edge_types$OriginType]
    }
    if(verbose) { 
      message("-- Origin Types: ", paste(origin_types, collapse = ", "))
      # print(sample_edge_types)
    }
    
    # total time variable
    total_time = 0
    trial_times = rep(NA, times = n_trials)
    
    # time trial
    for (trial_number in 1:n_trials) {
    
      # sample random origin node
      random_origin = sample_edge_list[OriginType %in% origin_types, Origin] %>% unique() %>% sample(1)
      random_origin_type = sample_node_list[Node == random_origin, NodeType]
      
      # print trial number
      if(verbose) { message("-- Trial Number: ", trial_number, " (", random_origin_type, ")") }
      
      # construct random meta path starting at origin
      test_mp = random_mp(random_origin_type, mp_length, sample_edge_types, verbose = verbose)
      
      # sample random destination node
      random_destination = sample_node_list %>%
        .[NodeType == tail(test_mp, 1)] %>%
        .[Node == sample(Node, 1), Node]
      
      # time similarity search
      start_time = Sys.time()
      sim_result = get_similarity(
        random_origin, random_destination,
        mp = test_mp,
        metric = c("dwpc"),
        node_list = sample_node_list,
        edge_list = sample_edge_list,
        verbose = F)
      end_time = Sys.time()
      
      # compute time required in seconds
      time_required = as.numeric(difftime(end_time, start_time, units = "secs"))
      trial_times[trial_number] = time_required
      total_time = total_time + time_required
      
    }
    
    # compute average time
    average_time = total_time / n_trials
    
    # append to graph list
    graph_total_time = graph_total_time + average_time
    graph_average_times[graph_number] = average_time
    graph_trial_times = append(graph_trial_times, list(trial_times))
  
  }
  
  # generate return object
  graph_average_time = graph_total_time / n_graphs
  names(graph_trial_times) = paste("Graph", 1:n_graphs)
  
  return(list(Average = graph_average_time,
              `Graph Averages` = graph_average_times,
              Trials = graph_trial_times))
  
}

Test on randomly generated graphs of various sizes.

# map over sizes
# size_list = c(128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536)
size_list = c(128, 256, 512, 1024, 2048, 4096, 8192, 16384)
time_test = map(size_list, ~test_size(.x, mp_length = 3, n_graphs = 10, n_trials = 10, origin_types = c("disease", "protein", "function"), verbose = T))

Plot execution time by graph size.

library(ggplot2)

# parse times (each size contributes n_graphs = 10 graph-average times)
time_vals = data.table(
  Size = rep(size_list, each = 10),
  Time = unlist(map(time_test, ~.x$`Graph Averages`)))

# generate statistics
time_stats = time_vals[, .(Mean = mean(Time), SD = sd(Time)), by = Size] %>%
  .[, Upper := Mean + SD] %>%
  .[, Lower := Mean - SD]

# generate plot
p = ggplot(time_stats, aes(x = Size, y = Mean, fill = log2(Size))) +
  geom_bar(stat = "identity", color = "black") +
  geom_errorbar(data = time_stats, aes(x = Size, ymin = Lower, ymax = Upper),
                width = 0.2, inherit.aes = FALSE) +
  scale_fill_gradientn(colors = c("#FFDAB9", "#FBC4AB", "#F8AD9D", "#F4978E", "#F08080")) +
  labs(x = "Graph Size", y = "Mean Running Time (seconds)") +
  scale_x_continuous(trans = "log2", n.breaks = length(size_list)) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.06))) +
  theme_bw() +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 12),
    legend.position = "none"
  )

# save plot
ggsave(filename = "../Figures/size_runtime.pdf", p, width = 8, height = 5, units = "in")
ggsave(filename = "../Figures/size_runtime.svg", p, width = 8, height = 5, units = "in")
ggsave(filename = "../Figures/size_runtime.png", p, width = 8, height = 5, units = "in")

Test how execution time scales with metapath length.

# get all node types
node_types = unique(biokg_node_list$NodeType)

# list possible edge types from unique(biokg_edge_list$EdgeType),
# excluding "drug-sideeffect" since all sideeffect nodes are terminal
edge_types = c("disease-protein", "drug-disease", "drug-drug", "drug-protein", "protein-protein", "protein-function", "function-function")
  
edge_types = data.table(Origin = map_chr(strsplit(edge_types, "-"), ~.x[1]),
                        Destination = map_chr(strsplit(edge_types, "-"), ~.x[2]))

# list origin types
origin_types = unique(edge_types$Origin)

# list destination types
destination_types = unique(edge_types$Destination)

# define random performance evaluation function
# meta path length must be at least 2
random_test = function(mp_length) {
  
  # sample random origin node
  random_origin = biokg_node_list[NodeType %in% origin_types][Node == sample(Node, 1), ]
  
  # construct a random metapath starting at the origin
  # (named test_mp to avoid shadowing the random_mp() function defined above)
  test_mp = c(random_origin$NodeType)
  for (k in 1:(mp_length - 1)) {
    next_type = edge_types[Origin == tail(test_mp, 1), Destination] %>% sample(1)
    test_mp = append(test_mp, next_type)
  }
  
  # sample random destination node
  random_destination = biokg_node_list %>%
    .[NodeType == tail(test_mp, 1)] %>%
    .[Node == sample(Node, 1), ]
  
  # time similarity search
  start_time = Sys.time()
  sim_result = get_similarity(
    random_origin$Node, random_destination$Node,
    mp = test_mp,
    metric = c("dwpc"),
    node_list = biokg_node_list,
    edge_list = biokg_edge_list,
    verbose = T)
  end_time = Sys.time()
  
  # compute time required in seconds
  time_required = as.numeric(difftime(end_time, start_time, units = "secs"))
  
  # construct return object
  out = list(
    Origin = random_origin,
    Destination = random_destination,
    MP = test_mp,
    Time = time_required
  )
  
  return(out)
  
}

# test meta paths with lengths 2-4
length_2 = map(1:50, ~random_test(2))
length_3 = map(1:50, ~random_test(3))
length_4 = map(1:50, ~random_test(4))
all_times = c(length_2, length_3, length_4)

# get times
times_2 = map_dbl(length_2, ~.x$Time)
times_3 = map_dbl(length_3, ~.x$Time)
times_4 = map_dbl(length_4, ~.x$Time)
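To compare scaling across lengths, the trial times can be collected into one table and summarized (a minimal sketch using the vectors above; length_times is an illustrative name):

# combine times into a single table keyed by metapath length
length_times = data.table(
  Length = rep(2:4, each = 50),
  Time = c(times_2, times_3, times_4))

# summarize running time by metapath length
length_times[, .(Mean = mean(Time), Median = median(Time), SD = sd(Time)), by = Length]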