Integrate scRNA datasets based on shared features/metadata#
Show code cell content Hide code cell content
!lamin load test-scrna
import lamindb as ln
import lnschema_bionty as lb
import pandas as pd
import anndata as ad
# Raise LaminDB's logging verbosity so that hints are printed as well.
ln.settings.verbosity = 3 # show hints
# NOTE(review): presumably registers this notebook run with LaminDB so later
# operations are attributed to it — confirm against the lamindb docs.
ln.track()
Query files based on metadata#
# Each query filters registered files through a linked metadata registry and
# renders the matches as a DataFrame. `.distinct()` drops duplicate rows, which
# can appear when a file matches through more than one related record.

# Files whose linked tissue name contains "lymph node"
# (`icontains` is the case-insensitive substring lookup).
ln.File.filter(tissues__name__icontains="lymph node").distinct().df()
# Files whose linked cell type name contains "monocyte".
ln.File.filter(cell_types__name__icontains="monocyte").distinct().df()
# Files carrying a label named exactly "female".
ln.File.filter(labels__name="female").distinct().df()
Intersect measured genes between two datasets#
# Look up the two registered files by their exact description; `.one()` is
# expected to return exactly one match for each query.
file1, file2 = (
    ln.File.filter(description="Conde22").one(),
    ln.File.filter(description="10x reference pbmc68k").one(),
)
# Show the metadata summary of each file.
file1.describe()
file2.describe()
# Load both files into memory (file1 first, then file2).
file1_adata, file2_adata = file1.load(), file2.load()
Here we compute shared genes without loading files:
# Gene records registered for the "var" slot of each file.
file1_genes, file2_genes = file1.features["var"], file2.features["var"]
# `&` intersects the two gene sets, leaving the genes measured in both files.
shared_genes = file1_genes & file2_genes
# Preview the symbols of the first ten shared genes.
shared_genes.list("symbol")[:10]
We also need to convert the gene identifiers of file2 from `ensembl_gene_id` to `symbol` so that the two datasets can be concatenated:
# Build an Ensembl-gene-ID -> symbol lookup from the genes registered for file2.
# `values_list` yields (ensembl_gene_id, symbol) pairs, so the DataFrame gets
# the default integer column labels: 0 holds the ID, 1 holds the symbol.
mapper = (
    pd.DataFrame(file2_genes.values_list("ensembl_gene_id", "symbol"))
    .drop_duplicates(0)  # keep only the first symbol seen for each Ensembl ID
    .set_index(0)[1]  # Series: index = Ensembl ID, values = symbol
)
mapper.head()
# Rename file2's var index with the mapper. The lookup is derived from file2's
# own genes and file2 is the dataset being converted from Ensembl IDs to
# symbols; the rename was previously applied to file1_adata by mistake.
# Index labels that are missing from the mapper are left unchanged.
file2_adata.var.rename(index=mapper, inplace=True)
Intersect cell types#
# All cell type records linked to each file.
file1_celltypes, file2_celltypes = file1.cell_types.all(), file2.cell_types.all()
# `&` intersects the two sets, leaving the cell types annotated in both files.
shared_celltypes = file1_celltypes & file2_celltypes
# Names of the shared cell types, used below to subset the loaded data.
shared_celltypes_names = shared_celltypes.list("name")
shared_celltypes_names
We can now subset the two datasets by shared cell types:
# Boolean mask over file1's observations: True where the cell type is shared.
file1_is_shared = file1_adata.obs["cell_type"].isin(shared_celltypes_names)
file1_adata_subset = file1_adata[file1_is_shared]
# Number of retained cells per cell type.
file1_adata_subset.obs["cell_type"].value_counts()
But when we subset the second file the same way, the two cell types are missing. Why?
Because they are labeled with synonyms!
# Same subsetting for file2, using its cell type labels as they currently are.
file2_is_shared = file2_adata.obs["cell_type"].isin(shared_celltypes_names)
file2_adata_subset = file2_adata[file2_is_shared]
# Number of retained cells per cell type.
file2_adata_subset.obs["cell_type"].value_counts()
We can easily standardize them using `.map_synonyms`:
# Overwrite file2's cell type labels with the result of mapping them through
# the CellType registry's synonyms, so that synonym labels can match the
# shared cell type names.
file2_adata.obs["cell_type"] = lb.CellType.map_synonyms(file2_adata.obs["cell_type"])
Now we have the two cell types:
# Redo the file2 subset now that its cell type labels have been standardized.
file2_is_shared = file2_adata.obs["cell_type"].isin(shared_celltypes_names)
file2_adata_subset = file2_adata[file2_is_shared]
# Number of retained cells per cell type.
file2_adata_subset.obs["cell_type"].value_counts()
# Concatenate the two subsets along the observation axis. `label="file"` adds
# an obs column named "file" whose values are taken from `keys`, here the
# description of the file each cell came from.
subsets = [file1_adata_subset, file2_adata_subset]
source_descriptions = [file1.description, file2.description]
adata_concat = ad.concat(subsets, label="file", keys=source_descriptions)
adata_concat
# Count how often each distinct combination of obs values occurs.
adata_concat.obs.value_counts()
Show code cell content Hide code cell content
# Clean up after the notebook: delete the `test-scrna` instance via the lamin
# CLI, then recursively remove its local `./test-scrna` directory.
!lamin delete test-scrna
!rm -r ./test-scrna