Source code for cytopus.tl.create

#import networkx as nx


[docs]
def construct_kb(celltype_edges, geneset_gene_edges,geneset_celltype_edges,annotation_dict,metadata_dict=None,save=False, save_path=None):
    '''
    construct a cytopus.kb.KnowledgeBase object from edge lists
    celltype_edges: list, list of tuples storing the edges of the cell type hierarchy as ('child', 'parent')
    geneset_gene_edges: list, list of tuples storing the edges connecting every gene_set with every gene as ('gene_set','gene')
    geneset_celltype_edges: list, list of tuples storing the edges connecting every gene sets with its cell type as ('gene_set','celltype')
    annotation_dict: dict, containing the gene set names as keys and their annotation names (cellular_process or cellular_identity) as values
    metadata_dict: dict, nested dict containing the gene set names as keys and a dict storing their attributes_categories as keys and corresponding attributes as values
    save: bool, if True saves the data to the path provided in save_path
    save_path: str or os.PathLike, path the pickled networkx graph is written to (binary pickle file)

    returns: KnowledgeBase constructed from the assembled networkx.DiGraph
    raises: ValueError if a gene set is annotated with anything other than cellular_process or cellular_identity
            KeyError if a gene set from geneset_gene_edges is missing in annotation_dict

    Edges may be given as tuples or as lists; they are converted to tuples internally.
    Gene sets are taken from geneset_gene_edges only: entries of geneset_celltype_edges whose
    gene set does not occur in geneset_gene_edges are not added to the graph (a warning is printed).
    '''
    import os
    import networkx as nx

    #normalize edges to tuples so that edges passed as lists can be extended with an attribute dict below
    celltype_edges = [tuple(x) for x in celltype_edges]
    geneset_gene_edges = [tuple(x) for x in geneset_gene_edges]
    geneset_celltype_edges = [tuple(x) for x in geneset_celltype_edges]

    #get genes, genesets, celltypes
    genes = [(x,{'class':'gene'}) for x in {e[1] for e in geneset_gene_edges}]
    genesets_in_gene_edges = {e[0] for e in geneset_gene_edges}
    gene_sets = list(genesets_in_gene_edges)
    celltypes_in_hierarchy = {e[0] for e in celltype_edges}.union({e[1] for e in celltype_edges})
    celltypes = [(x,{'class':'cell_type'}) for x in celltypes_in_hierarchy]

    #some sanity checks
    celltypes_of_genesets = {e[1] for e in geneset_celltype_edges}
    set_dif = celltypes_of_genesets - celltypes_in_hierarchy
    if set_dif:
        print('WARNING: missing cell types:',set_dif,'in the cell type hierarchy. Please append cell type hierarchy.')
    else:
        print('all cell types in gene set are contained in the cell type hierarchy')

    genesets_in_celltype_edges = {e[0] for e in geneset_celltype_edges}
    if genesets_in_celltype_edges != genesets_in_gene_edges:
        print('WARNING: Gene sets in geneset_celltype_edges and geneset_gene_edges are not identical')

    #sort processes and identities (sets give constant-time membership tests when filtering the edges)
    processes = set()
    identities = set()
    for gene_set in gene_sets:
        annotation = annotation_dict[gene_set]
        if annotation == 'cellular_process':
            processes.add(gene_set)
        elif annotation == 'cellular_identity':
            identities.add(gene_set)
        else:
            raise ValueError('all gene sets annotation names should be either cellular_process or cellular_identity')

    #set edge attributes (important for queries)
    celltype_edges = [x + ({'class':'SUBSET_OF'},) for x in celltype_edges]
    geneset_gene_edges_processes = [x + ({'class':'gene_OF'},) for x in geneset_gene_edges if x[0] in processes]
    geneset_gene_edges_identities = [x + ({'class':'gene_OF'},) for x in geneset_gene_edges if x[0] in identities]
    geneset_celltype_edge_processes = [x + ({'class':'process_OF'},) for x in geneset_celltype_edges if x[0] in processes]
    geneset_celltype_edge_identities = [x + ({'class':'identity_OF'},) for x in geneset_celltype_edges if x[0] in identities]

    #construct graph
    #identities are a subset of gene_sets, so adding gene_sets already creates their nodes
    G = nx.DiGraph()
    G.add_nodes_from(genes)
    G.add_nodes_from(gene_sets)
    G.add_nodes_from(celltypes)
    G.add_edges_from(geneset_gene_edges_processes)
    G.add_edges_from(geneset_gene_edges_identities)
    G.add_edges_from(celltype_edges)
    G.add_edges_from(geneset_celltype_edge_processes)
    G.add_edges_from(geneset_celltype_edge_identities)

    #set node metadata
    if isinstance(metadata_dict,dict):
        nx.set_node_attributes(G, metadata_dict)
    else:
        print('No metadata dictionary provided (optional), skipping metadata assignment.')

    if save:
        #accept pathlib.Path (and other os.PathLike objects) in addition to plain strings
        if not isinstance(save_path,(str, os.PathLike)):
            print('WARNING: Please provide save_path if you want to save the data. Skipping saving step.')
        else:
            import pickle
            with open(save_path, 'wb') as f:
                pickle.dump(G, f)
            print('Pickled and saved to:',save_path)

    #NOTE(review): KnowledgeBase is not imported in the visible part of this module - confirm it is in scope
    return KnowledgeBase(graph=G)