Estimating cell type fractions with a signature matrix using non-negative least squares (NNLS)

It calls the nnls package to estimate cell type fractions of bulk data using a pre-estimated signature matrix. It is recommended to keep the row and column names of the input data.

est_frac(sig, bulk)

Arguments

sig: signature matrix (marker gene x cell type).
bulk: bulk data that need to be deconvolved (gene x tissue sample).

Value

A matrix containing the estimated cell type fractions (tissue sample x cell type). Row sums have been normalized to be 1 per sample.

Examples

# Reading GTEx brain data: from read count to CPM (count per million)
library(data.table)
gtex_sample = read.delim('https://storage.googleapis.com/gtex_analysis_v7/annotations/GTEx_v7_Annotations_SampleAttributesDS.txt', header = T, stringsAsFactors = F)
gtex_sample = gtex_sample[gtex_sample$SMTS == 'Brain',]

download.file('https://storage.googleapis.com/gtex_analysis_v7/rna_seq_data/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_reads.gct.gz', 'GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_reads.gct.gz')
rnaseq_sample_id = fread('GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_reads.gct.gz', header = T, skip = 2, nrows = 1)
gtex_sample = gtex_sample[gtex_sample$SAMPID %in% colnames(rnaseq_sample_id),]

# read count
gtex_count <- fread('GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_reads.gct.gz', header = T, skip = 2, select = c('Description', gtex_sample$SAMPID))
gene_name = gtex_count$Description
gtex_count = as.matrix(as.data.frame(gtex_count[,-1]))
rownames(gtex_count) = gene_name
gtex_cpm = apply(gtex_count, 2, function(x) x/sum(x)*1e6)
dim(gtex_cpm)
#> [1] 56202  1671
gtex_cpm[1:5, 1:2]
#>            GTEX-1117F-3226-SM-5N9CT GTEX-111FC-3126-SM-5GZZ2
#> DDX11L1                  0.04831143               0.08702695
#> WASH7P                  16.78016981              10.80584635
#> MIR1302-11               0.03220762               0.00000000
#> FAM138A                  0.01610381               0.01450449
#> OR4G4P                   0.01610381               0.01450449

# Reading the signature matrix
signature = read.csv('https://raw.githubusercontent.com/randel/MIND/master/data/Signature_matrix_Darmanis.csv', row.names = 1)
dim(signature)
#> [1] 754   6
head(signature)
#>            Astrocyte Endothelial Microglia Excitatory Inhibitory      Oligo
#> SLC17A7   0.04039056    0.966799  0.000000   398.5269   1.642269  1.1941426
#> CHN1    322.61389570   34.306107  0.000000  2691.3709 243.566109 28.2254723
#> NPTX1     3.03031572    8.012009  0.000000   582.5650  11.166288  0.1630647
#> RYR2      0.89334183    2.254418  0.000000   306.1922  70.521640  7.1790295
#> MKL2     14.43231849   67.999553  8.433413   198.6433  18.744386  0.9991692
#> ENC1      2.42182261   11.128192 65.242142   784.8072 232.384410  2.6651087

# Running est_frac()
sig = log2(as.matrix(signature) + 1)
bulk = log2(gtex_cpm[rownames(sig),] + 1)

cell_fraction = est_frac(sig, bulk)
#> Loading required package: nnls
#>   Astrocyte Endothelial   Microglia  Excitatory  Inhibitory       Oligo 
#>        0.20        0.17        0.06        0.22        0.26        0.11 

# heatmap of the average cell fraction for each brain region: using brain-region-specific reference is recommended
region = substr(gtex_sample$SMTSD, 9, 100)
frac_region = apply(cell_fraction, 2, function(x) tapply(x, region, mean))

suppressMessages(library(NMF))
aheatmap(frac_region, color = 'Blues', treeheight = 0)