Installation and setup
Reading and preprocessing the datasets
The dataset contains control and IFN-beta-stimulated cells; we use it as the query dataset.
from platform import python_version
print(python_version())
3.11.7
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import decoupler as dc
import session_info
import matplotlib.pyplot as plt
%matplotlib inline
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80, facecolor='white')
sc.logging.print_header()
scanpy==1.9.6 anndata==0.10.4 umap==0.5.5 numpy==1.26.3 scipy==1.12.0 pandas==2.2.0 scikit-learn==1.4.0 statsmodels==0.14.1 pynndescent==0.5.11
results_file = '/path/Data/file_merged.h5ad'  # the file that will store the analysis results
# Read each 10x sample; every per-sample directory contains the `.mtx` matrix.
# Use gene symbols for the variable names (variables-axis index) and cache for faster re-reads.
samples = ["SS001", "SS002", "SS003", "SS004", "SS005", "SS006", "SS007", "SS008"]
adatas = {
    sample: sc.read_10x_mtx(
        f'/path/{sample}/outs/filtered_gene_bc_matrices/GRCh38',
        var_names='gene_symbols',
        cache=True)
    for sample in samples
}
# Concatenate all samples; the dictionary keys end up in the new `batch` column.
file_merged = ad.concat(adatas, label="batch")
file_merged
# Label every cell with a condition; start with 'WT' and overwrite it for the stimulated batch.
file_merged.obs['condition'] = 'WT'
file_merged.obs
# The value compared against `batch` must be one of the concat keys above (SS001..SS008);
# replace 'D6' with the batch that corresponds to the stimulated sample.
Trait = file_merged.obs.batch == 'D6'
file_merged.obs.loc[Trait, 'condition'] = 'IRF3&IFNb&vRNA'
file_merged.obs['condition']
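When several batches correspond to different conditions, it can be cleaner to express the assignment as a single dictionary lookup. The batch-to-condition pairs below are placeholders for illustration only, not the actual experimental design:
# Hypothetical batch-to-condition mapping; replace with the real design.
condition_map = {
    "SS001": "WT",
    "SS002": "WT&IFNb",
    # ... one entry per batch ...
}
file_merged.obs['condition'] = (
    file_merged.obs['batch'].astype(str).map(condition_map).fillna('WT')
)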
file_merged.var_names_make_unique() # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`
file_merged
Detect variable genes
Variable genes can be detected across the full dataset, but then we run the risk of selecting many batch-specific genes that drive a large part of the variation. Alternatively, we can select variable genes in each batch separately to capture mainly cell-type variation. Below we do both: `sc.pp.highly_variable_genes` is first run on the full dataset and then with `batch_key='batch'`, and the resulting selection is stored in file_merged.var.highly_variable.
sc.pl.highest_expr_genes(file_merged, n_top=20, )
sc.pp.filter_cells(file_merged, min_genes=200)
sc.pp.filter_genes(file_merged, min_cells=3)
file_merged.var['mt'] = file_merged.var_names.str.startswith('MT-')
# annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(file_merged, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
file_merged
sc.pl.violin(file_merged, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
jitter=0.4, multi_panel=True)
file_merged = file_merged[file_merged.obs.n_genes_by_counts < 8500, :]
file_merged = file_merged[file_merged.obs.pct_counts_mt < 20 , :]
file_merged
file_merged.layers['counts'] = file_merged.X.copy()  # keep the raw counts before normalisation
sc.pp.normalize_total(file_merged, target_sum=1e4)
sc.pp.log1p(file_merged)
sc.pp.highly_variable_genes(file_merged, min_mean=0.0125, max_mean=3, min_disp=0.5)
var_genes_all = file_merged.var.highly_variable
print("Highly variable genes: %d"%sum(var_genes_all))
Detect variable genes in each dataset separately using the batch_key parameter.
sc.pp.highly_variable_genes(file_merged, min_mean=0.0125, max_mean=3, min_disp=0.5, batch_key = 'batch')
print("Highly variable genes intersection: %d"%sum(file_merged.var.highly_variable_intersection))
print("Number of batches where gene is variable:")
print(file_merged.var.highly_variable_nbatches.value_counts())
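Genes flagged as variable in only one batch are exactly the batch-specific genes mentioned above. A quick check, as a small sketch using the columns created by the `batch_key` call:
# Count highly variable genes that are variable in exactly one batch (likely batch-specific).
batch_specific = file_merged.var.highly_variable_nbatches == 1
print("HVGs variable in a single batch only: %d" % sum(batch_specific & file_merged.var.highly_variable))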
var_genes_batch = file_merged.var.highly_variable_nbatches > 0
Compare overlap of the variable genes.
print("Any batch var genes: %d"%sum(var_genes_batch))
print("All data var genes: %d"%sum(var_genes_all))
print("Overlap: %d"%sum(var_genes_batch & var_genes_all))
print("Variable genes in all batches: %d"%sum(file_merged.var.highly_variable_nbatches == 6))
print("Overlap batch instersection and all: %d"%sum(var_genes_all & file_merged.var.highly_variable_intersection))
Select all genes that are variable in more than two batches (i.e. in at least three) and use them for the remaining analysis.
var_select = file_merged.var.highly_variable_nbatches > 2
var_genes = var_select.index[var_select]
len(var_genes)
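Note that the subsetting a few cells below uses the default `highly_variable` column. If you instead want to restrict the analysis to the stricter batch-aware selection in `var_genes`, a subset along these lines (a sketch, replacing the `highly_variable`-based subsetting below) would do it:
# Optional alternative: keep only the genes variable in more than two batches.
file_merged = file_merged[:, var_genes].copy()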
sc.pl.highly_variable_genes(file_merged)
file_merged.raw = file_merged
file_merged = file_merged[:, file_merged.var.highly_variable]
sc.pp.regress_out(file_merged, ['total_counts', 'pct_counts_mt'])
sc.pp.scale(file_merged, max_value=10)
Data integration: batch-effect correction
BBKNN
First we will run BBKNN, which is implemented in Scanpy as `sc.external.pp.bbknn`. BBKNN builds a batch-balanced neighbour graph that takes the place of the standard `sc.pp.neighbors` output, so it is run after PCA and before clustering and UMAP.
sc.tl.pca(file_merged, svd_solver='arpack')
sc.pl.pca_variance_ratio(file_merged, log=True)
file_merged_bbknn = file_merged.copy()  # work on a copy so the uncorrected object is kept for comparison
sc.external.pp.bbknn(file_merged_bbknn, batch_key='batch', n_pcs=30)
sc.pp.neighbors(file_merged, n_neighbors=10, n_pcs=30)
sc.tl.leiden(file_merged)
sc.tl.paga(file_merged)
sc.pl.paga(file_merged, plot=False) # remove `plot=False` if you want to see the coarse-grained graph
sc.tl.umap(file_merged, init_pos='paga')
sc.tl.tsne(file_merged)
sc.pp.neighbors(file_merged_bbknn, n_neighbors=10, n_pcs=30)
sc.tl.leiden(file_merged_bbknn)
sc.tl.paga(file_merged_bbknn)
sc.pl.paga(file_merged_bbknn, plot=False) # remove `plot=False` if you want to see the coarse-grained graph
sc.tl.umap(file_merged_bbknn, init_pos='paga')
sc.tl.tsne(file_merged_bbknn)
We can now plot the reduced dimensions of the uncorrected and the BBKNN-corrected data.
fig, axs = plt.subplots(2, 2, figsize=(10,8),constrained_layout=True)
sc.pl.tsne(file_merged_bbknn, color="batch", title="BBKNN Corrected tsne", ax=axs[0,0], show=False)
sc.pl.tsne(file_merged, color="batch", title="Uncorrected tsne", ax=axs[0,1], show=False)
sc.pl.umap(file_merged_bbknn, color="batch", title="BBKNN Corrected umap", ax=axs[1,0], show=False)
sc.pl.umap(file_merged, color="batch", title="Uncorrected umap", ax=axs[1,1], show=False)
sc.pl.tsne(file_merged, color='batch', title="Uncorrected tSNE", wspace=.5)
sc.pl.tsne(file_merged_bbknn, color='batch', title="BBKNN Corrected tSNE", wspace=.5)
sc.tl.rank_genes_groups(file_merged_bbknn, 'batch', method='t-test')
sc.pl.rank_genes_groups(file_merged_bbknn, n_genes=25, sharey=False)
sc.settings.verbosity = 2 # reduce the verbosity
sc.tl.rank_genes_groups(file_merged_bbknn, 'batch', method='wilcoxon')
sc.pl.rank_genes_groups(file_merged_bbknn, n_genes=25, sharey=False)
file_merged_bbknn.write(results_file)
pd.DataFrame(file_merged_bbknn.uns['rank_genes_groups']['names']).head(10)
result = file_merged_bbknn.uns['rank_genes_groups']
groups = result['names'].dtype.names
pd.DataFrame(
{group + '_' + key[:1]: result[key][group]
for group in groups for key in ['names', 'pvals']}).head(10)
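For a tidy per-group table, Scanpy also offers a convenience accessor that returns the same information as a DataFrame; in recent Scanpy versions, passing `group=None` returns all groups at once:
# Same ranking results as a long-format DataFrame (group, names, scores, log fold changes, p-values).
deg_table = sc.get.rank_genes_groups_df(file_merged_bbknn, group=None)
deg_table.head(10)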
# The group and reference labels must match values present in file_merged_bbknn.obs['condition'].
sc.tl.rank_genes_groups(file_merged_bbknn, 'condition', groups=['IRF3&IFNb'], reference='IRF3', method='wilcoxon')
sc.pl.rank_genes_groups(file_merged_bbknn, groups=['IRF3&IFNb'], n_genes=20, fontsize=10)
sc.tl.rank_genes_groups(file_merged_bbknn, 'condition', groups=['WT&IFNb'], reference='WT', method='wilcoxon')
sc.pl.rank_genes_groups(file_merged_bbknn, groups=['WT&IFNb'], n_genes=20, fontsize=10)
sc.pl.rank_genes_groups_violin(file_merged_bbknn, groups='WT&IFNb', n_genes=8)
file_merged_bbknn.obs
sc.pl.umap(file_merged_bbknn, color=['condition', 'batch'], wspace=.5)
sc.pl.tsne(file_merged_bbknn, color='leiden', legend_loc='on data', title='', frameon=False, save='.pdf')
file_merged_bbknn
file_merged_bbknn.write(results_file)
Automatic cell type annotation from marker genes
Using decoupler
https://decoupler-py.readthedocs.io/en/latest/notebooks/cell_annotation.html
markers = dc.get_resource('PanglaoDB')
markers
# Filter by canonical_marker and human
markers = markers[(markers['human']=='True')&(markers['canonical_marker']=='True')]
# Remove duplicated entries
markers = markers[~markers.duplicated(['cell_type', 'genesymbol'])]
markers
dc.run_ora(
mat=file_merged_bbknn,
net=markers,
source='cell_type',
target='genesymbol',
min_n=3,
verbose=True
)
file_merged_bbknn
Enrichment with Over Representation Analysis (ORA)
file_merged_bbknn.obsm['ora_estimate']
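Under the hood, ORA asks whether the top-ranked genes of each cell overlap a cell type's marker set more than expected by chance, i.e. a Fisher's exact (hypergeometric) test on a 2x2 table; `ora_estimate` holds -log10 of the resulting p-values, which is why infinities appear when p = 0 and are handled below. A toy example of the underlying test, with made-up numbers:
from scipy import stats

# Toy 2x2 contingency table: of 50 top-ranked genes in a cell, 12 are markers of the
# candidate cell type; the background has 20000 genes, of which 150 are markers.
in_top_markers = 12
top_n, n_markers, background = 50, 150, 20000
table = [[in_top_markers, top_n - in_top_markers],
         [n_markers - in_top_markers, background - top_n - (n_markers - in_top_markers)]]
odds_ratio, pval = stats.fisher_exact(table, alternative='greater')
print(f"odds ratio = {odds_ratio:.2f}, p-value = {pval:.2e}")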
acts = dc.get_acts(file_merged_bbknn, obsm_key='ora_estimate')
# We need to remove inf and set them to the maximum value observed for pvals=0
acts_v = acts.X.ravel()
max_e = np.nanmax(acts_v[np.isfinite(acts_v)])
acts.X[~np.isfinite(acts.X)] = max_e
acts
sc.pl.umap(acts, color=['NK cells', 'leiden'], cmap='RdBu_r')
sc.pl.violin(acts, keys=['NK cells'], groupby='leiden')
df = dc.rank_sources_groups(acts, groupby='leiden', reference='rest', method='t-test_overestim_var')
df
n_ctypes = 3
ctypes_dict = df.groupby('group').head(n_ctypes).groupby('group')['names'].apply(lambda x: list(x)).to_dict()
ctypes_dict
sc.pl.matrixplot(acts, ctypes_dict, 'leiden', dendrogram=True, standard_scale='var',
colorbar_title='Z-scaled scores', cmap='RdBu_r')
sc.pl.violin(acts, keys=['T cells', 'B cells', 'Platelets', 'Monocytes', 'NK cells'], groupby='leiden')
annotation_dict = df.groupby('group').head(1).set_index('group')['names'].to_dict()
annotation_dict
file_merged_bbknn.obs['cell_type'] = [annotation_dict[clust] for clust in file_merged_bbknn.obs['leiden']]
# Visualize
sc.pl.umap(file_merged_bbknn, color='cell_type')
file_merged_bbknn.obs
file_merged_bbknn.write(results_file)
session_info.show()
-----
anndata         0.10.4
decoupler       1.5.0
matplotlib      3.8.2
numpy           1.26.3
pandas          2.2.0
scanpy          1.9.6
session_info    1.0.0
-----
-----
IPython         8.20.0
jupyter_client  8.6.0
jupyter_core    5.5.0
jupyterlab      4.0.8
notebook        7.0.6
-----
Python 3.11.7 (main, Dec 15 2023, 12:09:56) [Clang 14.0.6]
macOS-14.2.1-arm64-arm-64bit
-----
Session information updated at 2024-01-30 16:02