Creating a sample knowledge base

[ ]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from networkx.drawing.nx_agraph import graphviz_layout

[4]:
# Directory containing the .csv files of the knowledge base.
# TODO: point this at your own copy of the files; it defaults to the current working directory.
# Kept as a plain string without a trailing slash because all paths are built by string concatenation.
DATA_DIR = '.'
# path to the dataframe containing the genes in every gene set
gene_sets_path = DATA_DIR + '/Cytopus_1.31nc_gene-sets_x_genes.csv'
# path to the dataframe containing metadata about the gene sets
metadata_path = DATA_DIR + '/Cytopus_1.31nc_versions_metadata.csv'
# path to the dataframe containing the cell type hierarchy
cellular_hierarchies_path = DATA_DIR + '/Cytopus_1.31nc_hierarchies.csv'

preparing data

construct cell type hierarchy

We first construct the hierarchy of cell types to which we will then attach the gene sets. For this we need a list of tuples indicating all the edges between cell types, each pointing from a child cell type to its parent, in the format:

celltype_edges = [('celltype_child_1', 'celltype_parent_1'), ('celltype_child_2', 'celltype_parent_2'), ...]

[5]:
# Here we import the cell type hierarchy from a dataframe. As the preview of the table shows, the parent
# nodes are stored in the 'parent' column (column index 0) and the child nodes in the 'child' column (column index 1).
cellular_hierarchies= pd.read_csv(cellular_hierarchies_path)
cellular_hierarchies.head()
[5]:
parent child
0 all-cells leukocyte
1 leukocyte M
2 leukocyte TNK
3 TNK ILC
4 leukocyte B
[6]:
 # Pair each child cell type with its parent to obtain a list of (child, parent) tuples
 celltype_edges = list(zip(cellular_hierarchies['child'], cellular_hierarchies['parent']))
 celltype_edges[:10]
[6]:
[('leukocyte', 'all-cells'),
 ('M', 'leukocyte'),
 ('TNK', 'leukocyte'),
 ('ILC', 'TNK'),
 ('B', 'leukocyte'),
 ('B-naive', 'B'),
 ('epi', 'all-cells'),
 ('carcinoma-cell', 'epi'),
 ('NSCLC-carcinoma-cell', 'carcinoma-cell'),
 ('lung-epi', 'epi')]

get gene sets and metadata

Next, we connect genes with their respective gene sets by edges. For this we format the data as follows:

geneset_gene_edges = [('gene_set_1', 'gene_1'), ('gene_set_2', 'gene_2'), ...]

[7]:
# Here we load the genes belonging to each gene set. As the preview shows, the table has
# one row per gene with the columns 'gene_set_name' and 'gene_name'.
gene_sets = pd.read_csv(gene_sets_path)
gene_sets.head()

[7]:
gene_set_name gene_name
0 leuko_transendothelial-migration ESAM
1 leuko_transendothelial-migration MYLPF
2 leuko_transendothelial-migration CTNNA2
3 leuko_transendothelial-migration MYL12A
4 leuko_transendothelial-migration TXK
[8]:
# Edges between gene sets and genes as a list of (gene_set_name, gene_name) tuples
geneset_gene_edges = list(
    gene_sets[['gene_set_name', 'gene_name']].itertuples(index=False, name=None)
)
geneset_gene_edges[:10]
[8]:
[('leuko_transendothelial-migration', 'ESAM'),
 ('leuko_transendothelial-migration', 'MYLPF'),
 ('leuko_transendothelial-migration', 'CTNNA2'),
 ('leuko_transendothelial-migration', 'MYL12A'),
 ('leuko_transendothelial-migration', 'TXK'),
 ('leuko_transendothelial-migration', 'CLDN8'),
 ('leuko_transendothelial-migration', 'PTK2B'),
 ('leuko_transendothelial-migration', 'MMP9'),
 ('leuko_transendothelial-migration', 'PTPN11'),
 ('leuko_transendothelial-migration', 'CLDN9')]
[9]:
# Here we load the metadata from a dataframe, indexed by the gene set name. As the preview shows, it
# contains the version, cell type, annotation name, author and licensing information of every gene set.
metadata = pd.read_csv(metadata_path,index_col='gene_set_name')
metadata.head()
[9]:
version_id cell_type_name annotation_name author license license_link license_type gene_set_type gene_set_topic Column1 Column2
gene_set_name
leuko_transendothelial-migration v_hs217 leukocyte cellular_process GO Term 2019-01-01: 10.5281/zenodo.2529950_210... Creative Commons Attribution 4.0 Unported Lic... http://geneontology.org/docs/go-citation-policy/ c manual_external immune_function NaN NaN
CD4-T_TH22_UP v_hs65b CD4-T cellular_process wallet MIT https://github.com/wallet-maker/cytopus/blob/m... c manual_internal immune_function NaN NaN
all_autophagic-cell-death v_hs220 all-cells cellular_process GO Term 2019-01-01: 10.5281/zenodo.2529950_210... Creative Commons Attribution 4.0 Unported Lic... http://geneontology.org/docs/go-citation-policy/ c manual_external cell_death_autophagy NaN NaN
all_n-glycan_degradation v_hs160 all-cells cellular_process 10.1038/ncomms13041 NaN NaN c manual_external metabolism NaN NaN
T_central-memory_UP v_hs78b TCM cellular_identity wallet MIT https://github.com/wallet-maker/cytopus/blob/m... c manual_internal immune_identity NaN NaN
[10]:
# Same format for the edges between gene sets and cell types: (gene_set_name, cell_type_name) tuples
geneset_celltype_edges = list(zip(metadata.index, metadata['cell_type_name']))
geneset_celltype_edges[:10]
[10]:
[('leuko_transendothelial-migration', 'leukocyte'),
 ('CD4-T_TH22_UP', 'CD4-T'),
 ('all_autophagic-cell-death', 'all-cells'),
 ('all_n-glycan_degradation', 'all-cells'),
 ('T_central-memory_UP', 'TCM'),
 ('all_actin-cytoskeleton_regulation', 'all-cells'),
 ('lung-epi_IL13_response', 'lung-epi'),
 ('all_mitophagy', 'all-cells'),
 ('T_CD4-T_UP', 'CD4-T'),
 ('all_steroid_metabolism', 'all-cells')]

We also need to classify our gene sets into cellular processes (gene programs) or identities (cell type markers). Here this classification is taken from the annotation_name column of the metadata.

[11]:
# Dictionary mapping every gene set name (the metadata index) to its annotation name
annotation_dict = metadata.loc[:, 'annotation_name'].to_dict()
annotation_dict
[11]:
{'leuko_transendothelial-migration': 'cellular_process',
 'CD4-T_TH22_UP': 'cellular_process',
 'all_autophagic-cell-death': 'cellular_process',
 'all_n-glycan_degradation': 'cellular_process',
 'T_central-memory_UP': 'cellular_identity',
 'all_actin-cytoskeleton_regulation': 'cellular_process',
 'lung-epi_IL13_response': 'cellular_process',
 'all_mitophagy': 'cellular_process',
 'T_CD4-T_UP': 'cellular_identity',
 'all_steroid_metabolism': 'cellular_process',
 'all_MHC-I-presentation': 'cellular_process',
 'all_DNA_synthesis': 'cellular_process',
 'M_cDC2_UP': 'cellular_identity',
 'all_ketone-body_metabolism': 'cellular_process',
 'all_fatty-acid-beta-oxidation-peroxisomal': 'cellular_process',
 'all_MHC-II-presentation': 'cellular_process',
 'all_riboflavin_metabolism': 'cellular_process',
 'all_fatty-acid-metabolism': 'cellular_process',
 'B_memory-non-switched_UP': 'cellular_identity',
 'all_ubiquinone_synthesis': 'cellular_process',
 'all_galactose_metabolism': 'cellular_process',
 'all_inositol-phosphate_metabolism': 'cellular_process',
 'all_DNA-methylation': 'cellular_process',
 'lung-epi_IL4_response': 'cellular_process',
 'leuko_ROS_production': 'cellular_process',
 'B_pb-mature_UP': 'cellular_identity',
 'all_hyaluronan_metabolism': 'cellular_process',
 'all_hedgehog_signaling': 'cellular_process',
 'fibro_IL4_response': 'cellular_process',
 'B_plasma_UP': 'cellular_identity',
 'all_porphyrine-heme_metabolism': 'cellular_process',
 'mast_mast-cell-granule-effectors': 'cellular_process',
 'all_glycogenesis': 'cellular_process',
 'B_pb-t1_UP': 'cellular_identity',
 'ILC_CD56bright-NK_UP': 'cellular_identity',
 'endo_lung-venous_UP': 'cellular_identity',
 'all_mitophagy_regulation_positive': 'cellular_process',
 'all_autophagy-of-mitochondria': 'cellular_process',
 'Mac_CSF1_response': 'cellular_process',
 'all_glycerophospholipid_metabolism': 'cellular_process',
 'all_taurine-hypotaurine_metabolism': 'cellular_process',
 'CD8-T_progenitor-exhaustion_UP': 'cellular_process',
 'endo_aerocyte_UP': 'cellular_identity',
 'ILC_ILC3-NCRneg_UP': 'cellular_identity',
 'B_effector-2_UP': 'cellular_process',
 'all_osmotic-stress-response': 'cellular_process',
 'endo_VEGFC_response': 'cellular_process',
 'all_fatty-acid_synthesis': 'cellular_process',
 'TNK_PD-1_signaling': 'cellular_process',
 'all_transmembrane-transport-cellmembrane': 'cellular_process',
 'fibro_BMP4_response': 'cellular_process',
 'all_nucleotide_metabolism': 'cellular_process',
 'TNK_cytotoxicity-effectors': 'cellular_process',
 'all_GLU_metabolism': 'cellular_process',
 'all_CYS_metabolism': 'cellular_process',
 'Mac_IL4-IL13_response': 'cellular_process',
 'all_xenobiotics_metabolism': 'cellular_process',
 'all_coagulation-factor_production': 'cellular_process',
 'all_JAK-STAT_signaling': 'cellular_process',
 'all_HIS_metabolism': 'cellular_process',
 'B_memory_UP': 'cellular_identity',
 'CD4-T_IL4_response': 'cellular_process',
 'all_PHE_metabolism': 'cellular_process',
 'all_autophagy-nucleus': 'cellular_process',
 'all_heparan-sulfate_degradation': 'cellular_process',
 'CD4-T_TH9_UP': 'cellular_process',
 'all_ros_response': 'cellular_process',
 'all_cytosolic-DNA-sensing_signaling': 'cellular_process',
 'all_autophagy-selective': 'cellular_process',
 'B_memory-double-negative_UP': 'cellular_identity',
 'M_angiogenic-effectors': 'cellular_process',
 'all_cholesterol-homeostasis': 'cellular_process',
 'all_IL6-JAK-STAT3_signaling': 'cellular_process',
 'all_NAD_metabolism': 'cellular_process',
 'all_purine_synthesis': 'cellular_process',
 'CD4-T_TH2_UP': 'cellular_process',
 'all_ALA-ASP_metabolism': 'cellular_process',
 'all_fatty-acid-beta-oxidation-mitochondrial': 'cellular_process',
 'all_glyoxylate-dicarboxylate_metabolism': 'cellular_process',
 'all_folate_metabolism': 'cellular_process',
 'lung-smooth-muscle_TGFB1_response': 'cellular_process',
 'all_hypoxia-response': 'cellular_process',
 'all_n-glycan_synthesis': 'cellular_process',
 'ILC_adaptive-NK_UP': 'cellular_identity',
 'M_IL17A_response': 'cellular_process',
 'all_transmembrane-transport-mitochondrial': 'cellular_process',
 'all_lipophagy': 'cellular_process',
 'T_gdT_UP': 'cellular_identity',
 'endo_capillary_UP': 'cellular_identity',
 'all_ascorbate-uptake': 'cellular_process',
 'all_autophagy_regulation_positive': 'cellular_process',
 'M_mac_CSF1_response': 'cellular_process',
 'B_pb-t2_UP': 'cellular_identity',
 'all_chondroitine-and-heparan-sulfate_synthesis': 'cellular_process',
 'all_unfolded-protein-response': 'cellular_process',
 'CD4-T_TFH_UP': 'cellular_identity',
 'all_thiamin_metabolism': 'cellular_process',
 'all_circadian-rhythm': 'cellular_process',
 'all_pyruvate_metabolism': 'cellular_process',
 'all_type-I-ifn-response': 'cellular_process',
 'lung-epi_IL17A-IL22_response': 'cellular_process',
 'B_Breg_UP': 'cellular_process',
 'fibro_IL1B_response': 'cellular_process',
 'all_ARG-PRO_metabolism': 'cellular_process',
 'all_pyrimidine_metabolism': 'cellular_process',
 'all_pyrimidine_synthesis': 'cellular_process',
 'all_GABA-shunt': 'cellular_process',
 'all_GPI-anchor_synthesis': 'cellular_process',
 'CD4-T_Treg_UP': 'cellular_identity',
 'NSCLC-carcinoma-cell_TGFB1_response': 'cellular_process',
 'CD4-T_TH17_UP': 'cellular_process',
 'T_tissue-resident-memory_UP': 'cellular_identity',
 'all_protein-degradation-proteasome': 'cellular_process',
 'all_TRP_metabolism': 'cellular_process',
 'all_triacylglycerol_synthesis': 'cellular_process',
 'CD4-T_IL12_response': 'cellular_process',
 'all_bile-acid_synthesis': 'cellular_process',
 'all_transmembrane-transport-lysosome': 'cellular_process',
 'all_purine_metabolism': 'cellular_process',
 'CD8-T_tumor-reactive-like_UP': 'cellular_process',
 'CD8-T_terminal-exhaustion': 'cellular_process',
 'all_histone-methylation': 'cellular_process',
 'all_reticulophagy': 'cellular_process',
 'M_Langerhans_UP': 'cellular_identity',
 'all_phosphoinositide_signaling': 'cellular_process',
 'all_cyclic-nucleotide_metabolism': 'cellular_process',
 'B_effector-1_UP': 'cellular_process',
 'all_urea-cycle': 'cellular_process',
 'all_selenoamino-acid_metabolism': 'cellular_process',
 'all_peroxisome-component': 'cellular_process',
 'CD8-T_IL12_response': 'cellular_process',
 'all_chondroitine-sulfate_degradation': 'cellular_process',
 'all_iron-uptake-and-storage': 'cellular_process',
 'all_NOD-like-receptor_signaling': 'cellular_process',
 'all_thrombolysis-factor_production': 'cellular_process',
 'T_stem-cell-memory_UP': 'cellular_identity',
 'ILC_NK_UP': 'cellular_identity',
 'all_wnt-beta-catenin-signaling': 'cellular_process',
 'all_keratan-sulfate_synthesis': 'cellular_process',
 'all_TGFb_response': 'cellular_process',
 'all_TYR_metabolism': 'cellular_process',
 'neutro_CXCL8_response': 'cellular_process',
 'T_IL21_response': 'cellular_process',
 'all_pentose-phosphate-pathway': 'cellular_process',
 'B_memory-IgM-MZ_UP': 'cellular_identity',
 'all_CoA_synthesis': 'cellular_process',
 'B_IgM-ligation_response': 'cellular_process',
 'all_glycerin-SER-THR_metabolism': 'cellular_process',
 'all_type-II-ifn-response': 'cellular_process',
 'all_cholesterol_metabolism': 'cellular_process',
 'all_glycogenolysis': 'cellular_process',
 'all_autophagy-chaperone-mediated': 'cellular_process',
 'all_DNA-demethylation': 'cellular_process',
 'endo_lymphatic_UP': 'cellular_identity',
 'all_multidrug-resistance': 'cellular_process',
 'all_MYC_targets': 'cellular_process',
 'B_plasma-blast_UP': 'cellular_identity',
 'T_CD8-T_UP': 'cellular_identity',
 'all_TLR_signaling': 'cellular_process',
 'B_UP': 'cellular_identity',
 'all_NOTCH_signaling': 'cellular_process',
 'all_polyamines_metabolism': 'cellular_process',
 'endo_systemic-venous_UP': 'cellular_identity',
 'all_apoptosis': 'cellular_process',
 'all_carnitine-shuttle': 'cellular_process',
 'M_macrophage_UP': 'cellular_identity',
 'M_moDC_UP': 'cellular_identity',
 'all_ROS-detoxification': 'cellular_process',
 'all_DNA-repair': 'cellular_process',
 'all_lactate_production': 'cellular_process',
 'ILC_ILC2_UP': 'cellular_identity',
 'NK_IL15_response': 'cellular_process',
 'ILC_CD56dim-NK_UP': 'cellular_identity',
 'M_p-DC_UP': 'cellular_identity',
 'all_retinol_metabolism': 'cellular_process',
 'all_creatinine_metabolism': 'cellular_process',
 'all_autophagy-peroxisome': 'cellular_process',
 'B_memory-switched_UP': 'cellular_identity',
 'TNK_IL2_response': 'cellular_process',
 'all_biotin_metabolism': 'cellular_process',
 'M_monocyte_UP': 'cellular_identity',
 'T_UP': 'cellular_identity',
 'all_oxidative-phosphorylation': 'cellular_process',
 'M_mast-cell_UP': 'cellular_identity',
 'all_microautophagy-lysosomal': 'cellular_process',
 'CD8-T_KLRG1neg-effector_UP': 'cellular_identity',
 'all_Beta-Ala_metabolism': 'cellular_process',
 'all_SASP': 'cellular_process',
 'all_PI3K-AKT-mTOR_signaling': 'cellular_process',
 'all_p53-signaling': 'cellular_process',
 'all_pterin_synthesis': 'cellular_process',
 'endo_arterial_UP': 'cellular_identity',
 'M_granulocyte_UP': 'cellular_identity',
 'all_TCA-cycle': 'cellular_process',
 'all_glycolysis': 'cellular_process',
 'all_MET_metabolism': 'cellular_process',
 'all_macroautophagy': 'cellular_process',
 'all_RIG-I-like-receptor_signaling': 'cellular_process',
 'all_nucleophagy-late': 'cellular_process',
 'T_naive_UP': 'cellular_identity',
 'M_FDC_UP': 'cellular_identity',
 'all_TNF-via-NFkB_signaling': 'cellular_process',
 'mast_granule-exocytosis': 'cellular_process',
 'all_posttranslation-modification': 'cellular_process',
 'lung-epi_TGFB1_response': 'cellular_process',
 'M_cDC1_UP': 'cellular_identity',
 'all_complement_production': 'cellular_process',
 'Treg_FoxP3-stabilization': 'cellular_process',
 'T_IL4_response': 'cellular_process',
 'all_G1S-transition': 'cellular_process',
 'all_G2M-transition': 'cellular_process',
 'B_naive_UP': 'cellular_identity',
 'all_amino-sugar-nucleotide-sugar_metabolism': 'cellular_process',
 'fibro_EGF_response': 'cellular_process',
 'all_mitotic-spindle-component': 'cellular_process',
 'all_o-glycan_synthesis': 'cellular_process',
 'all_fructose-mannose_metabolism': 'cellular_process',
 'fibro_IL13_response': 'cellular_process',
 'all_platelet-activation-factor_production': 'cellular_process',
 'ILC_ILC1_UP': 'cellular_identity',
 'TNK_IL2-STAT5-signaling': 'cellular_process',
 'all_mTORC1_signaling': 'cellular_process',
 'CD8-T_KLRG1pos-effector_UP': 'cellular_identity',
 'all_eicosanoid_metabolism': 'cellular_process',
 'T_tcr-activation': 'cellular_process',
 'all_CYP_metabolism': 'cellular_process',
 'B_pb-t3_UP': 'cellular_identity',
 'fibro_TGFB1_response': 'cellular_process',
 'all_propanoate_metabolism': 'cellular_process',
 'all_glutathione_metabolism': 'cellular_process',
 'all_citric-acid-cycle': 'cellular_process',
 'B_germinal-center_UP': 'cellular_identity',
 'all_sphingolipid_metabolism': 'cellular_process',
 'fibro_IGF1_response': 'cellular_process',
 'all_ethanol_metabolism': 'cellular_process',
 'all_transmembrane-transport-ER': 'cellular_process',
 'M_cDC3_UP': 'cellular_identity',
 'T_effector-memory_UP': 'cellular_identity',
 'p-DC_CpG-TLR9_response': 'cellular_process',
 'DC_LPS_response': 'cellular_process',
 'all_VAL-LEU-ILE_metabolism': 'cellular_process',
 'all_pyroptosis': 'cellular_process',
 'all_autophagy-of-mitochondria_regulation_positive': 'cellular_process',
 'ILC_lymphoid-tissue-inducer_UP': 'cellular_identity',
 'all_macroautophagy_regulation_positive': 'cellular_process',
 'Mac_LPS_response': 'cellular_process',
 'ILC_ILC3-NCRpos_UP': 'cellular_identity',
 'DC_antigen-crosspresentation': 'cellular_process',
 'all_LYS_metabolism': 'cellular_process',
 'all_keratan-sulfate_degradation': 'cellular_process',
 'all_exocytosis': 'cellular_process',
 'endo_VEGFA_response': 'cellular_process',
 'Mac_IFNG_response': 'cellular_process',
 'all_transmembrane-transport-golgi': 'cellular_process'}

You can add additional metadata to the gene sets (optional). Provide this data as a nested dictionary of the format:

{'gene_set_1': {'attribute_type_1': 'attribute_a', 'attribute_type_2': 'attribute_b', ...}, ...}

[12]:
# Metadata columns to attach to the gene sets as additional attributes
metadata_columns = [
    'version_id',
    'author',
    'license',
    'license_link',
    'license_type',
    'gene_set_type',
    'gene_set_topic',
]
# Nested dictionary of the form {gene_set_name: {column_name: value, ...}, ...}
metadata_dict = metadata.loc[:, metadata_columns].to_dict(orient='index')

construct the KnowledgeBase

[15]:
# Construct the KnowledgeBase from the edge lists, the annotation dictionary and the optional metadata.
# If you want to save the knowledge base, set save to True and provide the saving location as save_path.
import cytopus as cp
G = cp.create.construct_kb(
    celltype_edges,
    geneset_gene_edges,
    geneset_celltype_edges,
    annotation_dict,
    metadata_dict=metadata_dict,
    save=True,
    # Join with an explicit '/' like the other paths derived from DATA_DIR; without it the file
    # is only written inside DATA_DIR when DATA_DIR happens to end with a path separator.
    save_path=DATA_DIR + '/Cytopus_1.31nc.txt',
)
G
all cell types in gene set are contained in the cell type hierarchy
Pickled and saved to: /home/wallet/Downloads/1.31nc/Cytopus_1.31nc.txt
KnowledgeBase object containing 92 cell types and 201 cellular processes

[15]:
<cytopus.knowledge_base.kb_queries.KnowledgeBase at 0x7faeac767d90>
[16]:
# Plot the cell type hierarchy of the knowledge base constructed above
G.plot_celltypes()
all celltypes in knowledge base: ['CD8-T_terminal-exhaustion', 'colon-epi', 'crc-carcinoma-cell', 'ILC1', 'CD8-T_KLRG1pos-effector', 'NK', 'endo-lymphatic', 'CD4-T', 'CD8-T', 'endo', 'CD56bright-NK', 'CD4-TCM', 'cDC3', 'endo-arterial', 'B-pb-t2', 'CD8-Teffector', 'Langerhans', 'iNKT', 'M', 'FDC', 'CD8-TSCM', 'T', 'cDC', 'CD8-TRM', 'TRM', 'NK-adaptive', 'TFH', 'MAIT', 'capillary', 'ILC2', 'T-naive', 'endo-aerocyte', 'mono', 'B', 'DC', 'baso', 'gdT', 'ILC', 'TSCM', 'B-pb-mature', 'CD4-TSCM', 'MDC', 'ILC3-NCRneg', 'lung-smooth-muscle', 'mo-DC', 'gran', 'CD4-Teffector', 'TEM', 'neutro', 'B-memory-IgM-MZ', 'Lti', 'p-DC', 'mast', 'all-cells', 'B-naive', 'cDC1', 'CD56dim-NK', 'epi', 'CD8-T-progenitor-exhausted', 'cDC2', 'B-pb-t1', 'carcinoma-cell', 'TCM', 'TNK', 'ILC3-NCRpos', 'nc-mono', 'plasma-blast', 'leukocyte', 'c-mono', 'Mac', 'smooth-muscle', 'CD4-TRM', 'CD8-TEM', 'GC-B', 'CD8-T_KLRG1neg-effector', 'ILC3', 'lung-endo-venous', 'B-memory', 'eosino', 'B-memory-non-switched', 'fibro', 'B-memory-switched', 'CD4-TEM', 'NSCLC-carcinoma-cell', 'Treg', 'CD8-TCM', 'abT', 'endo-systemic-venous', 'B-pb-t3', 'B-memory-DN', 'plasma', 'lung-epi']
../_images/tutorials_02_creating_knowledge_base_20_1.png