Source code for cytopus.tl.create
#import networkx as nx
[docs]
def construct_kb(
    celltype_edges,
    geneset_gene_edges,
    geneset_celltype_edges,
    annotation_dict,
    metadata_dict=None,
    save=False,
    save_path=None,
):
    '''
    construct a cytopus.kb.KnowledgeBase object from edge lists

    A networkx.DiGraph is assembled from the three edge lists and wrapped in a
    KnowledgeBase. Gene nodes carry {'class': 'gene'}, cell type nodes carry
    {'class': 'cell_type'}; gene set nodes are added without a 'class'
    attribute. Edges are labelled through their 'class' attribute:
    'gene_OF' (gene_set -> gene), 'SUBSET_OF' (child -> parent cell type),
    'process_OF' / 'identity_OF' (gene_set -> cell type).

    celltype_edges: iterable of 2-item sequences storing the edges of the cell type hierarchy as ('child', 'parent')
    geneset_gene_edges: iterable of 2-item sequences storing the edges connecting every gene_set with every gene as ('gene_set','gene')
    geneset_celltype_edges: iterable of 2-item sequences storing the edges connecting every gene set with its cell type as ('gene_set','celltype')
    annotation_dict: dict, containing the gene set names as keys and their annotation names (cellular_process or cellular_identity) as values
    metadata_dict: dict, nested dict containing the gene set names as keys and a dict storing their attributes_categories as keys and corresponding attributes as values; anything that is not a dict is ignored
    save: bool, if True pickles the constructed graph to the path provided in save_path
    save_path: str or os.PathLike, path of the file the pickled graph is written to (binary pickle, not plain text)

    returns: KnowledgeBase built with graph=<the constructed DiGraph>
    raises:
        KeyError, if a gene set of geneset_gene_edges is missing from annotation_dict
        ValueError, if an annotation is neither 'cellular_process' nor 'cellular_identity'

    Only gene sets that occur in geneset_gene_edges are classified, so entries
    of geneset_celltype_edges referring to other gene sets are not added to
    the graph (a warning is printed when the two collections differ).
    '''
    import os
    import pickle

    import networkx as nx

    # The edge collections are traversed several times below; materialise them
    # once so that one-shot iterables (e.g. generators) are not exhausted.
    celltype_edges = list(celltype_edges)
    geneset_gene_edges = list(geneset_gene_edges)
    geneset_celltype_edges = list(geneset_celltype_edges)

    # get genes, genesets, celltypes
    gene_nodes = [(gene, {'class': 'gene'}) for gene in {edge[1] for edge in geneset_gene_edges}]
    gene_sets = list({edge[0] for edge in geneset_gene_edges})
    celltype_names = {edge[0] for edge in celltype_edges} | {edge[1] for edge in celltype_edges}
    celltype_nodes = [(celltype, {'class': 'cell_type'}) for celltype in celltype_names]

    # some sanity checks: these only warn, construction continues regardless
    missing_celltypes = {edge[1] for edge in geneset_celltype_edges} - celltype_names
    if missing_celltypes:
        print('WARNING: missing cell types:', missing_celltypes, 'in the cell type hierarchy. Please append cell type hierarchy.')
    else:
        print('all cell types in gene set are contained in the cell type hierarchy')
    if {edge[0] for edge in geneset_celltype_edges} != set(gene_sets):
        print('WARNING: Gene sets in geneset_celltype_edges and geneset_gene_edges are not identical')

    # sort processes and identities; sets keep the per-edge membership tests
    # below at constant time instead of scanning a list for every edge
    processes = set()
    identities = set()
    for gene_set in gene_sets:
        annotation = annotation_dict[gene_set]
        if annotation == 'cellular_process':
            processes.add(gene_set)
        elif annotation == 'cellular_identity':
            identities.add(gene_set)
        else:
            raise ValueError('all gene sets annotation names should be either cellular_process or cellular_identity')

    # set edge attributes (important for queries); (*edge, attrs) accepts
    # edges given as tuples as well as lists
    hierarchy_edges = [(*edge, {'class': 'SUBSET_OF'}) for edge in celltype_edges]
    gene_edges_processes = [
        (*edge, {'class': 'gene_OF'}) for edge in geneset_gene_edges if edge[0] in processes
    ]
    gene_edges_identities = [
        (*edge, {'class': 'gene_OF'}) for edge in geneset_gene_edges if edge[0] in identities
    ]
    celltype_edges_processes = [
        (*edge, {'class': 'process_OF'}) for edge in geneset_celltype_edges if edge[0] in processes
    ]
    celltype_edges_identities = [
        (*edge, {'class': 'identity_OF'}) for edge in geneset_celltype_edges if edge[0] in identities
    ]

    # construct graph; gene_sets already contains every process and identity
    G = nx.DiGraph()
    G.add_nodes_from(gene_nodes)
    G.add_nodes_from(gene_sets)
    G.add_nodes_from(celltype_nodes)
    G.add_edges_from(gene_edges_processes)
    G.add_edges_from(gene_edges_identities)
    G.add_edges_from(hierarchy_edges)
    G.add_edges_from(celltype_edges_processes)
    G.add_edges_from(celltype_edges_identities)

    # set node metadata
    if isinstance(metadata_dict, dict):
        nx.set_node_attributes(G, metadata_dict)
    else:
        print('No metadata dictionary provided (optional), skipping metadata assignment.')

    if save:
        if isinstance(save_path, (str, os.PathLike)):
            # the raw graph is pickled, not the KnowledgeBase wrapper
            with open(save_path, 'wb') as f:
                pickle.dump(G, f)
            print('Pickled and saved to:', save_path)
        else:
            print('WARNING: Please provide save_path if you want to save the data. Skipping saving step.')

    return KnowledgeBase(graph=G)