R/deconv_em_func.r
est_frac.Rd
It calls the nnls package to estimate cell type fractions of bulk data using a pre-estimated signature matrix. It is recommended to keep the row and column names of the input data.
est_frac(sig, bulk)
signature matrix (marker gene x cell type).
bulk data that need to be deconvolved (gene x tissue sample).
A matrix containing the estimated cell type fractions (tissue sample x cell type). Row sums have been normalized to be 1 per sample.
# Reading GTEx brain data: from read count to CPM (count per million)
library(data.table)
gtex_sample = read.delim('https://storage.googleapis.com/gtex_analysis_v7/annotations/GTEx_v7_Annotations_SampleAttributesDS.txt', header = T, stringsAsFactors = F)
gtex_sample = gtex_sample[gtex_sample$SMTS == 'Brain',]
download.file('https://storage.googleapis.com/gtex_analysis_v7/rna_seq_data/GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_reads.gct.gz', 'GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_reads.gct.gz')
rnaseq_sample_id = fread('GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_reads.gct.gz', header = T, skip = 2, nrows = 1)
gtex_sample = gtex_sample[gtex_sample$SAMPID %in% colnames(rnaseq_sample_id),]
# read count
gtex_count <- fread('GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_reads.gct.gz', header = T, skip = 2, select = c('Description', gtex_sample$SAMPID))
gene_name = gtex_count$Description
gtex_count = as.matrix(as.data.frame(gtex_count[,-1]))
rownames(gtex_count) = gene_name
gtex_cpm = apply(gtex_count, 2, function(x) x/sum(x)*1e6)
dim(gtex_cpm)
#> [1] 56202 1671
gtex_cpm[1:5, 1:2]
#> GTEX-1117F-3226-SM-5N9CT GTEX-111FC-3126-SM-5GZZ2
#> DDX11L1 0.04831143 0.08702695
#> WASH7P 16.78016981 10.80584635
#> MIR1302-11 0.03220762 0.00000000
#> FAM138A 0.01610381 0.01450449
#> OR4G4P 0.01610381 0.01450449
# Reading the signature matrix
signature = read.csv('https://raw.githubusercontent.com/randel/MIND/master/data/Signature_matrix_Darmanis.csv', row.names = 1)
dim(signature)
#> [1] 754 6
head(signature)
#> Astrocyte Endothelial Microglia Excitatory Inhibitory Oligo
#> SLC17A7 0.04039056 0.966799 0.000000 398.5269 1.642269 1.1941426
#> CHN1 322.61389570 34.306107 0.000000 2691.3709 243.566109 28.2254723
#> NPTX1 3.03031572 8.012009 0.000000 582.5650 11.166288 0.1630647
#> RYR2 0.89334183 2.254418 0.000000 306.1922 70.521640 7.1790295
#> MKL2 14.43231849 67.999553 8.433413 198.6433 18.744386 0.9991692
#> ENC1 2.42182261 11.128192 65.242142 784.8072 232.384410 2.6651087
# Running est_frac()
sig = log2(as.matrix(signature) + 1)
bulk = log2(gtex_cpm[rownames(sig),] + 1)
cell_fraction = est_frac(sig, bulk)
#> Loading required package: nnls
#> Astrocyte Endothelial Microglia Excitatory Inhibitory Oligo
#> 0.20 0.17 0.06 0.22 0.26 0.11
# heatmap of the average cell fraction for each brain region: using brain-region-specific reference is recommended
region = substr(gtex_sample$SMTSD, 9, 100)
frac_region = apply(cell_fraction, 2, function(x) tapply(x, region, mean))
suppressMessages(library(NMF))
aheatmap(frac_region, color = 'Blues', treeheight = 0)