Investigating protein-protein interaction networks

Code
here::i_am("03-protein_protein_interactions.qmd")
here() starts at /home/training/EBI-workshop---Extracting-information-from-omics-data

So the idea here is to look if we see more genes encoding proteins that interact with each other then we would expect to observe by chance. If we do, then this could imply some directed biological change in a pathway in that condition/celltype.

The common way this is used is again with single cell seq data on celltype-specific differentially expressed genes by condition. So as an example, if we saw a bunch of interacting proteins in microglia from a AD differential expression, we might want to look more closely at what pathways those particular genes are associated with to try and pin-point some more specific biology underpinning the disease that could be investigated further.

Load packages

Code
library(igraph)
library(stringr)
library(DT)
library(parallel)
library(assertthat)
library(biomaRt)
library(dplyr)
library(purrr)
library(furrr)
library(clusterProfiler)
library(org.Hs.eg.db)
library(future)
library(enrichplot)
library(patchwork)
library(ggplot2)
library(readr)

Functions

Credit to Jimena for these functions, and indeed the basis for almost all the subsequent code!

Code
# Rescale data to range 0 to 1.
rescale01 <- function(numvect){
  rango <- range(numvect)
  rescalado <- unlist(lapply(numvect, function(x) if((x - rango[1]) == 0) { 0 } else {(x - rango[1]) / (rango[2] - rango[1])}))
  return(rescalado)
}

similar_gene_list  <- function(gene_list, attr_matrix, rango = 100){
  # attr_matrix must have at least a column with ensembl_gene_id and the rownames ordered by the attribute
  d_gl <- as.numeric(rownames(attr_matrix)[attr_matrix$ensembl_gene_id %in% gene_list])
  # Random sample of genes (most with a similar attribute)
  rango2 <- setdiff(seq(-rango/2, rango/2, 1), 0)
  rnd_ind <- sample(rango2, size = length(gene_list), replace = TRUE) 
  rnd_ind <- d_gl + rnd_ind
  # check the extremes (all should be within the range of D)
  if(length(which(rnd_ind < 1)) > 0) {
    rnd_ind[which(rnd_ind < 1)] <- sample(1:rango, 1)
  }
  if(length(which(rnd_ind > nrow(attr_matrix)))){
    rnd_ind[which(rnd_ind > nrow(attr_matrix))] <- sample(nrow(attr_matrix):(nrow(attr_matrix) - rango), 1)
  }
  # remove duplicates and replace with random genes (to keep gene number constant)
  if(length(rnd_ind) != length(unique(rnd_ind))){
    rnd_ind <- unique(rnd_ind)
    rnd_ind <- c(rnd_ind, sample(setdiff(1:nrow(attr_matrix), rnd_ind), size = (length(gene_list) - length(rnd_ind))))
  }
  rnd_smpl <- as.character(attr_matrix$ensembl_gene_id[rnd_ind])
  return(rnd_smpl)
}

# Divide genes according to an attribute #
attribute_2_list20bins <- function(attr_matrix, gene_col = 1, attr_col = 2, do_log = TRUE){
  if(do_log == TRUE){
    at <- rescale01(log(attr_matrix[, attr_col] + 1))  
  } else{
    at <- rescale01(attr_matrix[, attr_col])
  }
  bines <- seq(0, 1, .05)
  gps <- list()
  for(k in c(1:(length(bines) - 1))){
     if(k == 1){
      gps[[k]] <- as.character(union(attr_matrix[which(at <= bines[k]), gene_col],
                                     attr_matrix[which(at < bines[k + 1]), gene_col]))
     } else {
       if(k == 20){
         gps[[k]] <- as.character(intersect(attr_matrix[which(at >= bines[k]), gene_col], 
                                            attr_matrix[which(at <= bines[k +  1]), gene_col]))  
         } else {
           gps[[k]] <- as.character(intersect(attr_matrix[which(at >= bines[k]), gene_col],
                                              attr_matrix[which(at < bines[k + 1]), gene_col]))
           }
     }
  }
  return(gps)
}

Read in data

Let’s get our gene lists

Code
# read data
# Get DESeq2 results
res <- readr::read_tsv(here::here("data/ppi/differential_genes.tsv")) |>
  na.omit()

Data prep

And then filter to significant genes

Code
# Filter to sig genes
res_sig <- res |>
  dplyr::filter(min_padj < 0.05)

# Assert that there are no NAs in ensembl IDs
assertthat::are_equal(sum(is.na(res_sig$gene)), 0)
[1] TRUE

Note the use of the assertthat package here! This package effectively just throws an error if the boolean given to it isn’t true.

The purpose of this is similar to the idea of unit tests in software development, whereby tests are written into code/into automated checks run against the code to ensure it works as expected. It’s effectively a more defensive coding strategy.

For data analysis it’s nice to explicitly state what we expect to be true of our data at a given stage of the analysis. Perhaps you already have a habit of running little bits of code in the console to check that things look the way you expect them to?

In this particular case, we don’t want any missing ensembl IDs for our downstream analysis as this would cause errors. We could do nothing here and wait to encounter an error later, but it’s better to state that it should be true, and deliberately invoke the error here if it isn’t. This makes it immediately obvious what the issue is and we can debug, instead of having to backtrack through later code to try and locate the issue.

Background list

We want a background list of genes to be able to test if our contrasts have more interacting proteins than we would expect by chance. Here we’ll use the full list of genes we have in these samples.

Can you add an assertthat test to make sure we have no duplicated genes in our background?

Code
background_population <- readr::read_lines(here::here("data/ppi/background_genes.txt"))
assertthat::assert_that(sum(duplicated(background_population)) == 0)
[1] TRUE

Our protein-protein-interaction network is human so let’s convert our ensembl IDs from mouse to human.

Once again biomart isn’t playing ball so let’s use the same manual download again.

Code
ensembl <- useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")
mouse <- useEnsembl(biomart="ensembl", dataset="mmusculus_gene_ensembl")

human_genes <- getLDS(attributes = c("ensembl_gene_id"),
                      filters = "ensembl_gene_id",
                      values = res_sig$gene,
                      mart = mouse,
                      attributesL = c("hsapiens_homolog_ensembl_gene"),
                      martL = ensembl)
Code
mouse_ids <- readr::read_tsv(here::here("data/mouse_mart_export.txt")) |>
  janitor::clean_names()
Rows: 63042 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): Gene stable ID, Human gene stable ID

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
df <- res_sig |>
  dplyr::left_join(mouse_ids, by = join_by(gene == gene_stable_id),
                   relationship = "many-to-many") |>
  # Keep only one instance of duplicates
  dplyr::filter(
    !duplicated(human_gene_stable_id) |
      duplicated(human_gene_stable_id, fromLast = TRUE)
  )

# Convert background
human_background <-
  mouse_ids[mouse_ids$gene_stable_id %in% background_population,]$human_gene_stable_id |>
  unique()
human_background <- human_background[!is.na(human_background)]

We also want gene length data since larger genes would theoretically have more domains and therefore more potential interactions, so by getting the gene length we can try to account for this in our analysis.

This next chunk can take a little while to run so I’ll leave the eval as false and feel free to just skip running it and use the saved data.

Code
# Note that it can take a while to download the gene lists, so I've set eval 
# to false for this chunk.
ensembl = useEnsembl(biomart="ensembl")
human <- useDataset("hsapiens_gene_ensembl", mart = ensembl)

# CDS length # 
cds <- getBM(attributes = c("ensembl_gene_id", "cds_length"),
            mart = mouse)
colnames(cds) <- c("ensembl_gene_id", "cds_length")
cds <- cds[-which(is.na(cds$cds_length)== TRUE), ]
cds <- cds[order(cds$ensembl_gene_id), ]
cds_max <- aggregate.data.frame(cds, by = list(cds$ensembl_gene_id), FUN = max)
cds_max <- cds_max[, c("ensembl_gene_id", "cds_length")]
cds_max <- cds_max[cds_max$ensembl_gene_id %in% human_background, ]
write.table(cds_max, here::here("data/ppi/EnsemblGeneID_MaxCDSlength.txt"), 
            sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE)