Source code for cytopus.tl.label

#import pandas as pd
#import csv

def overlap_coefficient(set_a, set_b):
    '''
    calculate the overlap coefficient between two sets

    The coefficient is len(A & B) / min(len(A), len(B)).

    set_a: set or any other iterable of hashable items
    set_b: set or any other iterable of hashable items
    returns: float between 0 and 1, or nan if either input is empty
             (the coefficient is undefined in that case, so nan is returned
             instead of raising ZeroDivisionError)
    '''
    # Coerce to sets so lists/tuples/arrays are accepted and duplicates are
    # not counted twice.
    set_a = set(set_a)
    set_b = set(set_b)
    min_len = min(len(set_a), len(set_b))
    if min_len == 0:
        return float('nan')
    return len(set_a & set_b) / min_len
def _clean_gene_set(genes):
    '''return the unique entries of genes without missing values (None, NaN, pd.NA or the string 'nan')'''
    import pandas as pd

    return {gene for gene in genes if not pd.isna(gene) and gene != 'nan'}


def label_marker_genes(marker_genes, gs_label_dict, threshold = 0.4):
    '''
    label an array of marker genes using a KnowledgeBase or a dictionary derived from the KnowledgeBase
    returns a dataframe of overlap coefficients for each gene set annotation and marker gene

    marker_genes: numpy.array or list of lists, factors x marker genes
    gs_label_dict: cytopus.KnowledgeBase or dict, with gene set names (str) as keys and gene sets (list) as values.
                   Any object exposing a ``celltype_process_dict`` attribute is treated as a KnowledgeBase;
                   its nested dictionary is collapsed and the first occurrence of a gene set name wins.
    threshold: float, if overlap coefficient > than threshold the factor will be labeled with the gene set name
               with maximum overlap coefficient
    returns: pandas.DataFrame, with overlap coefficients of factors (rows) and gene sets (columns),
             indices are relabeled to the gene set with the maximum overlap coefficient.
             Entries are nan where the gene set or the marker list is empty; nan entries are ignored when
             picking the maximum. On ties the first gene set (in dictionary order) is used.
    raises: ValueError, if gs_label_dict is a nested dictionary or neither a dictionary nor a KnowledgeBase
    '''
    # Imported here because the module shown has no active top-level imports.
    import numpy as np
    import pandas as pd

    if hasattr(gs_label_dict, 'celltype_process_dict'):
        # Duck-typed KnowledgeBase: collapse {cell type: {gene set: genes}}
        # into a flat {gene set: genes} mapping, keeping the first occurrence.
        gs_dict = {}
        for process_dict in gs_label_dict.celltype_process_dict.values():
            for gs_name, gs in process_dict.items():
                gs_dict.setdefault(gs_name, gs)
    elif isinstance(gs_label_dict, dict):
        if any(isinstance(gs, dict) for gs in gs_label_dict.values()):
            raise ValueError('gs_label_dict is a nested dictionary. \n'
                             'gs_label_dict must be a flat/non-nested dictionary with gene set names as keys (str) '
                             'and gene sets (lists of strings) as values')
        gs_dict = gs_label_dict
    else:
        raise ValueError('gs_label_dict must be a dictionary or a cytopus.kb.queries.KnowledgeBase object')

    # Clean every gene set once instead of once per factor.
    gs_names = list(gs_dict)
    gene_sets = [_clean_gene_set(gs_dict[gs_name]) for gs_name in gs_names]

    factor_ids = []
    rows = []
    for factor_id, markers in pd.DataFrame(marker_genes).T.items():
        # Ragged marker lists are padded with None/NaN by pandas; drop the
        # padding so it does not count towards the size of the marker set.
        marker_set = _clean_gene_set(markers)
        rows.append([
            overlap_coefficient(gene_set, marker_set) if gene_set and marker_set else np.nan
            for gene_set in gene_sets
        ])
        factor_ids.append(factor_id)

    # Build the frame in one go (concatenating inside the loop is quadratic);
    # the reshape keeps the (factors x gene sets) shape even when one side is empty.
    scores = np.array(rows, dtype=float).reshape(len(factor_ids), len(gs_names))
    overlap_df = pd.DataFrame(scores, index=factor_ids, columns=gs_names)

    marker_gene_labels = []
    for factor_id, factor_scores in zip(factor_ids, scores):
        label = factor_id
        # Skip NaN scores explicitly: a plain sort would put them last and
        # they would mask the real maximum.
        if not np.isnan(factor_scores).all():
            best = int(np.nanargmax(factor_scores))
            if factor_scores[best] > threshold:
                label = gs_names[best]
        marker_gene_labels.append(label)

    overlap_df.index = marker_gene_labels
    return overlap_df
def get_celltype(adata, celltype_key,factor_list=None,Spectra_cell_scores= 'SPECTRA_cell_scores'):
    '''
    For a list of factors check in which cell types they are expressed

    adata: anndata.AnnData, containing cell type labels in adata.obs[celltype_key]
    celltype_key: str, key for adata.obs containing the cell type labels
    factor_list: list, list of keys for factor loadings in .obs, if None use the cell scores stored in
                 adata.obsm[Spectra_cell_scores]
    Spectra_cell_scores: str, key for Spectra cell scores in adata.obsm
    return: dictionary mapping factor names to 'global' (mean score is non-zero in every cell type) or to the
            cell type with the highest mean score
    '''
    # Imported here because the module shown has no active top-level imports.
    import pandas as pd

    if factor_list is not None:
        # Copy so that adding the helper column never writes into adata.obs.
        factors = adata.obs[factor_list].copy()
    else:
        factors = pd.DataFrame(adata.obsm[Spectra_cell_scores])
    factors['celltype'] = list(adata.obs[celltype_key])

    # mean score of every factor per cell type (rows: cell types, columns: factors)
    grouped_df = factors.groupby('celltype').mean()

    # a factor is global if its mean is non-zero in all cell types
    is_global = (grouped_df != 0).all()

    # global factors first, then cell type specific ones (same order as the columns)
    factor_names_global = {name: 'global' for name in is_global.index[is_global]}
    for name in is_global.index[~is_global]:
        # cell type with the highest mean score; the first one wins on ties
        factor_names_global[name] = grouped_df[name].idxmax()
    return factor_names_global
def get_gmt(gs_dict,save=False,path=None):
    '''
    transform a dictionary into a .gmt file

    gs_dict: dict, gene set dictionary with format {'gene set name':['Gene_a','Gene_b','Gene_c',...]};
             the dictionary and its gene lists are left unmodified
    save: bool, if True saves .gmt file to path
    path: str, path to save .gmt file
    returns: pandas.DataFrame with one row per gene set (shorter gene sets are padded with nan) if save is
             False, otherwise None
    raises: ValueError, if save is True but no path is given
    '''
    # Imported here because the module shown has no active top-level imports.
    import numpy as np
    import pandas as pd

    if save and path is None:
        # DataFrame.to_csv(None) only returns a string, so nothing would be written.
        raise ValueError('path must be provided if save is True')

    # Work on copies: padding the caller's lists in place would leak nan
    # entries into the gene sets of the original dictionary.
    gene_lists = {gs_name: list(genes) for gs_name, genes in gs_dict.items()}

    # pad the lists to equal lengths so they fit into a rectangular table
    max_length = max(map(len, gene_lists.values()), default=0)
    padded = {
        gs_name: genes + [np.nan] * (max_length - len(genes))
        for gs_name, genes in gene_lists.items()
    }

    # transform into df, one gene set per row
    gs_df = pd.DataFrame(padded).T
    if save:
        gs_df.to_csv(path,sep='\t',header=False)
        print('print saving to:',path)
        return None
    return gs_df
def flatten_hierarchical_dict(d, parent_key=None):
    '''
    flatten a nested dictionary into a list of (parent, child) tuples

    d: dict, nested dictionary; values that are dictionaries are descended into
    parent_key: key under which d is nested, if None the keys of d are not reported as children
    returns: list of (parent, child) tuples in depth-first order
    '''
    def _edges(subtree, parent):
        for child, descendants in subtree.items():
            # keys of the outermost dictionary have no parent unless one was passed in
            if parent is not None:
                yield (parent, child)
            if isinstance(descendants, dict):
                yield from _edges(descendants, child)

    return list(_edges(d, parent_key))
def hierarchy_to_csv(hierarchy,filename='hierarchy.csv',header_name=('Parent','Child')):
    '''
    get hierarchy from knowledge base and write to .csv

    hierarchy : dict, nested dict containing cell type hierarchy e.g. G.get_celltype_hierarchy()
    filename : str, output file name to write csv to
    header_name : list or tuple, header name of the csv
    '''
    # Imported here because the module shown has no active top-level imports.
    import csv

    flat_list = flatten_hierarchical_dict(hierarchy)

    # Write to CSV; newline='' lets the csv module control the line endings
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header_name)
        writer.writerows([parent, child] for parent, child in flat_list)
def geneset_to_csv(gs_dict, filename='geneset.csv', header_name=('gene_set_name','gene_name')):
    '''
    get gene sets from knowledge base and write to .csv

    Writes the header followed by one (gene set name, gene) row per gene.

    gs_dict : dict, gene set dictionary e.g. G.processes
    filename : str, output file name to write csv to
    header_name : list or tuple, name of header in .csv file
    '''
    # Imported here because the module shown has no active top-level imports.
    import csv

    # newline='' lets the csv module control the line endings
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header_name)
        writer.writerows(
            [gs_name, gene]
            for gs_name, genes in gs_dict.items()
            for gene in genes
        )
#import networkx as nx
#import pandas as pd
def metadata_to_csv(graph, file_name, specific_class = False, class_value=None):
    '''
    get metadata and write to csv

    graph : networkx.DiGraph, graph containing nodes with attributes
    file_name : str, path to write csv to
    specific_class : bool, if True restrict to nodes whose 'class' attribute equals class_value
    class_value : str, class attribute to restrict to (only used if specific_class is True)
    '''
    # Imported here because the module shown has no active top-level imports.
    import pandas as pd

    # Collect node attributes, optionally filtered by the 'class' attribute
    attributes = {
        node: data
        for node, data in graph.nodes(data=True)
        if not specific_class or data.get('class') == class_value
    }

    # One row per node, one column per attribute
    df = pd.DataFrame.from_dict(attributes, orient='index')

    # Saving to CSV
    df.to_csv(file_name)