Tutorial 2: inferring a sample–cell matrix and reconstructing sample-specific single-cell transcriptomic profiles
CellFreeGMF traces the cellular origins of these cfRNA markers and elucidates their cell-specific alterations based on cell-free transcriptome and single-cell sequencing data.
Preparation
[1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import anndata as ad
import scanpy as sc
[2]:
import CellFreeGMF
Hyperparameter Configuration
[3]:
seed_value = 2025
disease_name = 'PDAC'
current_path = os.path.abspath('')
Load the cfRNA transcriptome matrix
[4]:
def read_sample_cfRNA(disease_name = 'PDAC'):
data_path = current_path + '/data/' + disease_name
data_exp = pd.read_csv(data_path + '/GSE133684_exp_TPM-all.txt', sep='\t')
data_exp = data_exp.set_index(data_exp.columns[0]).T
label_all = pd.read_csv(data_path + '/GSE133684_series_matrix.csv', sep=',')
label_all = label_all.set_index(label_all.columns[0])
label_all['label'] = label_all[label_all.columns[0]].apply(lambda x: 1 if x == 'disease state: PDAC' else 0)
# label_all = label_all.loc[data_exp.index]
data_exp = data_exp.loc[label_all.index]
data_all = {'data_exp': data_exp,
'label': label_all}
return data_all
sample_cfRNA_exp = read_sample_cfRNA(disease_name)
exp_all = sample_cfRNA_exp['data_exp']
exp_all_label = sample_cfRNA_exp['label']['label']
Load the references single-cell transcriptomic data
[5]:
def read_scRNA():
adata = ad.read_h5ad(current_path + '/data/TSP/TabulaSapiens.h5ad')
adata.var.index = adata.var['ensemblid']
if disease_name == 'pregnancy':
adata = adata[adata.obs['gender'] != 'male'].copy()
adata = adata[adata.obs['organ_tissue'] == 'Uterus'].copy()
if disease_name == 'tuberculosis':
adata = adata[adata.obs['organ_tissue'] == 'Lung'].copy()
if disease_name == 'PDAC':
adata = adata[adata.obs['organ_tissue'] == 'Pancreas'].copy()
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.filter_cells(adata, min_genes=100)
return adata
scRNA_adata = read_scRNA()
Visualization of scRNA-seq Data
[6]:
import copy
temp_sc = copy.deepcopy(scRNA_adata)
sc.pl.highly_variable_genes(temp_sc)
sc.tl.pca(temp_sc)
sc.pl.pca_variance_ratio(temp_sc, n_pcs=50, log=True)
sc.pp.neighbors(temp_sc)
sc.tl.umap(temp_sc)
sc.tl.tsne(temp_sc)
sc.pl.umap(temp_sc,
color = 'cell_ontology_class',
frameon = True,
save = '_' + disease_name + '.pdf')
sc.pl.tsne(temp_sc,
color = 'cell_ontology_class',
frameon = True,
save = '_' + disease_name + '.pdf')
WARNING: saving figure to file figures/umap_PDAC.pdf
WARNING: saving figure to file figures/tsne_PDAC.pdf
Target information was loaded for subsequent analysis.
[7]:
SHAP_cfRNA = pd.read_csv(current_path + '/save_data/' + disease_name + '/SHAP_ALL_gene.txt', header=None)
DEG = pd.read_csv(current_path + '/save_data/' + disease_name + '/DEG_res.csv')
DEG = DEG.sort_values(by="logFC", key=lambda x: x.abs(), ascending=False)
[8]:
sample_types = ['normal']
cfRNA_types = ['Target_gene']
for sample_type in sample_types:
for cfRNA_type in cfRNA_types:
if sample_type == 'normal':
if cfRNA_type == 'Target_gene':
target_cfRNA = set(SHAP_cfRNA.iloc[:,0].str.split('.').str[0].tolist()) | set(DEG.iloc[:, 0].str.split('.').str[0].tolist())
sample_cfRNA_exp, cell_exp_adata = CellFreeGMF.filter_with_overlap_gene(exp_all[exp_all_label == 0][list(target_cfRNA)], scRNA_adata)
elif sample_type == 'disease':
if cfRNA_type == 'Target_gene':
target_cfRNA = set(SHAP_cfRNA.iloc[:,0].str.split('.').str[0].tolist()) | set(DEG.iloc[:, 0].str.split('.').str[0].tolist())
sample_cfRNA_exp, cell_exp_adata = CellFreeGMF.filter_with_overlap_gene(exp_all[exp_all_label == 1][list(target_cfRNA)], scRNA_adata)
# starndard
cell_exp_count = pd.DataFrame(data=cell_exp_adata.layers['raw_counts'].toarray(),
index = cell_exp_adata.obs.index,
columns= cell_exp_adata.var.index)
stand_cell_exp = CellFreeGMF.log_cpm(cell_exp_count).T
stand_sample_exp = CellFreeGMF.log_cpm(sample_cfRNA_exp).T
# calculate similarity
cos_cell = cosine_similarity(stand_cell_exp)
Number of overlap genes: 8599
[9]:
cos_cell.shape,stand_cell_exp.shape
[9]:
((13497, 13497), (13497, 8599))
inferring a sample–cell matrix and reconstructing sample-specific single-cell transcriptomic profiles
[ ]:
sample_types = ['normal', 'disease']
cfRNA_types = ['Target_gene']
for sample_type in sample_types:
for cfRNA_type in cfRNA_types:
if sample_type == 'normal':
if cfRNA_type == 'Target_gene':
target_cfRNA = set(SHAP_cfRNA.iloc[:,0].str.split('.').str[0].tolist()) | set(DEG.iloc[:, 0].str.split('.').str[0].tolist())
sample_cfRNA_exp, cell_exp_adata = CellFreeGMF.filter_with_overlap_gene(exp_all[exp_all_label == 0][list(target_cfRNA)], scRNA_adata)
elif sample_type == 'disease':
if cfRNA_type == 'Target_gene':
target_cfRNA = set(SHAP_cfRNA.iloc[:,0].str.split('.').str[0].tolist()) | set(DEG.iloc[:, 0].str.split('.').str[0].tolist())
sample_cfRNA_exp, cell_exp_adata = CellFreeGMF.filter_with_overlap_gene(exp_all[exp_all_label == 1][list(target_cfRNA)], scRNA_adata)
# starndard
cell_exp_count = pd.DataFrame(data=cell_exp_adata.layers['raw_counts'].toarray(),
index = cell_exp_adata.obs.index,
columns= cell_exp_adata.var.index)
stand_cell_exp = CellFreeGMF.log_cpm(cell_exp_count).T
stand_sample_exp = CellFreeGMF.log_cpm(sample_cfRNA_exp).T
# calculate similarity
cos_sample = cosine_similarity(stand_sample_exp)
row_id, col_id = np.diag_indices_from(cos_sample)
cos_sample[row_id, col_id] = 0
Sample_cell = np.random.rand(cos_sample.shape[0], stand_cell_exp.shape[0])
Result_cfRNA_cell, Result_sample_cfRNA, Result_sample_cell = CellFreeGMF.GNMF(sample_cfRNA = stand_sample_exp.values,
cell_cfRNA = stand_cell_exp.values,
sample_sim = cos_sample,
beta = 0.001,
alpha = 0.001,
iter_num = 100)
np.savez(file=current_path + '/save_data/' + disease_name + '/Decon_sampletype_' + sample_type + '_cfRNAtype_' + cfRNA_type + '.npz',
Result_cfRNA_cell = Result_cfRNA_cell,
Result_sample_cfRNA = Result_sample_cfRNA,
Result_sample_cell = Result_sample_cell,
sample_name = stand_sample_exp.index,
cfRNA_name = stand_sample_exp.columns,
cell_name_free_annotation = cell_exp_adata.obs.free_annotation,
cell_name_cell_ontology_class = cell_exp_adata.obs.cell_ontology_class,
cell_id = stand_cell_exp.index,
sample_cfRNA = stand_sample_exp.values,
cell_cfRNA = stand_cell_exp.values)
Number of overlap genes: 8599
GNMF: step=10 obj=816104100.320847 error=1.023134
, last_error=1.023410
, ddd: 0.000276
GNMF: step=20 obj=759958024.195542 error=0.984385
, last_error=0.994942
, ddd: 0.010558
GNMF: step=30 obj=715348310.949574 error=0.909971
, last_error=0.913787
, ddd: 0.003816
GNMF: step=40 obj=701777906.458539 error=0.886454
, last_error=0.888091
, ddd: 0.001636
GNMF: step=50 obj=693960927.356656 error=0.874075
, last_error=0.875069
, ddd: 0.000994
GNMF: step=60 obj=688430543.474555 error=0.866155
, last_error=0.866797
, ddd: 0.000642
GNMF: step=70 obj=684460638.526017 error=0.860976
, last_error=0.861416
, ddd: 0.000440
GNMF: step=80 obj=681300919.284633 error=0.857189
, last_error=0.857525
, ddd: 0.000337
GNMF: step=90 obj=678704319.051207 error=0.854236
, last_error=0.854501
, ddd: 0.000265
GNMF: step=100 obj=676556170.742099 error=0.851878
, last_error=0.852092
, ddd: 0.000214
Number of overlap genes: 8599
GNMF: step=10 obj=812183263.734288 error=1.019844
, last_error=1.021077
, ddd: 0.001233
GNMF: step=20 obj=744220852.194484 error=0.954944
, last_error=0.965675
, ddd: 0.010731
GNMF: step=30 obj=708012077.076432 error=0.898226
, last_error=0.901358
, ddd: 0.003132
GNMF: step=40 obj=695080683.149955 error=0.876718
, last_error=0.878315
, ddd: 0.001597
GNMF: step=50 obj=687787750.317495 error=0.865048
, last_error=0.865910
, ddd: 0.000862
GNMF: step=60 obj=682948350.501579 error=0.858609
, last_error=0.859126
, ddd: 0.000517
GNMF: step=70 obj=678662787.722342 error=0.854175
, last_error=0.854579
, ddd: 0.000404
GNMF: step=80 obj=674686769.361016 error=0.850461
, last_error=0.850806
, ddd: 0.000344
GNMF: step=90 obj=671122257.753963 error=0.847281
, last_error=0.847578
, ddd: 0.000298
GNMF: step=100 obj=667891931.031939 error=0.844493
, last_error=0.844756
, ddd: 0.000263