General aspects of the pipeline#

%load_ext autoreload

import os 
import sys
import time
import logging
import re
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from natsort import natsorted


sys.path.append(os.path.dirname(os.getcwd()))
from gridgene import get_arrays as ga
from gridgene import contours 
from gridgene import get_masks

sys.path.append(os.path.dirname(os.getcwd()))
from gridgene import get_arrays as ga
from gridgene import contours, get_masks 
from gridgene.mask_properties import MaskAnalysisPipeline, MaskDefinition
from gridgene.binsom import GetBins, GetContour

define the logger : can be None, and is set to INFO

# define the logger :  can be None, and is set to INFO
# Custom logger setup
logger = logging.getLogger('contour_logger')
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)

Get the files

xenium_path =  '../../xenium_data/HLA/GD_TMA1_S3/fov_filtered'

to_exclude = [
    'TMA1_Selection14_filtered.csv' , # little tumour
    'TMA1_Selection15_filtered.csv', # tonsil
    'TMA1_Selection18_filtered.csv' , # normal
    'TMA1_Selection24_filtered.csv', # tonsl 
    'TMA1_Selection27_filtered.csv', # low quality
    'TMA1_Selection32_filtered.csv', # low quality
    'TMA1_Selection33_filtered.csv', # low quality
             ]
files_tma1 = os.listdir(xenium_path)
files = [os.path.join(xenium_path, file) for file in files_tma1 if file not in to_exclude]
print(len(files))

Define Cancer Stroma based on density of genes#

This intends to based on defined genes to find tumour areas and empty areas. The stroma/other is defined as the area that is not empty and not tumour.

We need to define:

Parameters for tumour contours
- Genes to consider : Use the marker genes of the gene list with log fold changes for epithelial cells
- density
- minimum area
- kernel size
this will define for an overlapping area with “kernel size” * “kernel_size” a minimum number of “density” genes of interest. The contiguous area of the contour will have at least “minimum area”
Parameters for emptyness contours
- density
- minimum area
- kernel size this will define for an overlapping area with “kernel size” * “kernel_size” a maximum number of “density”/ counts of all genes. The contiguous area of the contour will have at least “minimum area”

# defined with the gene list f
target_tum = ['EPCAM', 'SMIM22','CLDN3', 'KRT18','LGALS4', 'KRT8', 'ELF3','TSPAN8', 'STMN1', 'CD47', 'MYC', 'LGALS3'] 

# param tum
density_th_tum = 25    # 50
min_area_th_tum =  500 #1000     check how much is a cell 
kernel_size_tum = 20   # 20

# param empty
density_th_empty = 20
min_area_th_empty = 50 #400
kernel_size_empty = 10

density_th_tum = 25    # 50
min_area_th_tum = 200
kernel_size_tum = 20   # 20

# param empty
density_th_empty = 20
min_area_th_empty = 400
kernel_size_empty = 10

grab data and derive arrays

file_csv = files[7]
df_total = pd.read_csv(file_csv)
df_total = df_total[['x_location', 'y_location', 'feature_name']]
df_total = df_total.rename(columns={'feature_name': 'target'})
df_total = df_total[~df_total['target'].str.contains('System|egative')]
df_total['X'] = df_total['x_location'] - min(df_total['x_location'])
df_total['Y'] = df_total['y_location'] - min(df_total['y_location'])

n_genes = len(df_total['target'].unique())
height = int(max(df_total['X'])) + 1
width = int(max(df_total['Y'])) + 1

print(f'n genes: {n_genes}')
print(f'shape: {height}, {width}')
print(f'n hits {len(df_total)}')

# this makes the sparse df to an array with the spatial information 
target_dict_total = {target: index for index, target in enumerate(df_total['target'].unique())}
array_total = ga.transform_df_to_array(df = df_total, target_dict=target_dict_total, array_shape = (height, width,len(target_dict_total))).astype(np.int8)

# creating subsets 
df_subset_tum, array_subset_tum, target_indices_subset_tum = ga.get_subset_arrays(df_total, array_total,target_dict_total,
                                                                     target_list=target_tum, target_col = 'target')

n genes: 480
shape: 1809, 1769
n hits 4433228

CTum = contours.ConvolutionContours(array_subset_tum, contour_name='tum')
CTum.get_conv_sum(kernel_size=kernel_size_tum, kernel_shape='square')
CTum.contours_from_sum(density_threshold = density_th_tum,
                       min_area_threshold = min_area_th_tum , directionality = 'higher')

CEmpty = contours.ConvolutionContours(array_total, contour_name='empty')
CEmpty.get_conv_sum(kernel_size=kernel_size_empty, kernel_shape='square')
CEmpty.contours_from_sum(density_threshold = density_th_empty,
                       min_area_threshold = min_area_th_empty, directionality = 'lower') # attention that directionality is lower here 

2025-06-20 15:16:41,914 - gridgen.contours.tum - INFO - Initialized GetContour
2025-06-20 15:16:41,996 - gridgen.contours.tum - INFO - get_conv_sum took 0.0809 seconds
2025-06-20 15:16:42,085 - gridgen.contours.tum - INFO - Number of contours after filtering no counts: 44
2025-06-20 15:16:42,085 - gridgen.contours.tum - INFO - contours_from_sum took 0.0889 seconds
2025-06-20 15:16:42,086 - gridgen.contours.empty - INFO - Initialized GetContour
2025-06-20 15:16:42,758 - gridgen.contours.empty - INFO - get_conv_sum took 0.6722 seconds
2025-06-20 15:16:43,406 - gridgen.contours.empty - INFO - Number of contours after filtering no counts: 20
2025-06-20 15:16:43,406 - gridgen.contours.empty - INFO - contours_from_sum took 0.6472 seconds

Plot contours

fig, axs = plt.subplots(1, 2, figsize=(15, 10))

CTum.plot_conv_sum(cmap='plasma', c_countour='white', ax=axs[0])
axs[0].set_title('Tum points and tum contours')

CEmpty.plot_conv_sum(cmap='plasma', c_countour='white', ax=axs[1])
axs[1].set_title('total points and contours for empty')

plt.show()


fig, axs = plt.subplots(1, 2, figsize=(15, 7))

CTum.plot_contours_scatter(path=None, show=False, s=0.05, alpha=0.5, linewidth=1,
                           c_points= 'blue',c_contours= 'red', ax=axs[0])
axs[0].set_title('Tum points and tum contours')  

CEmpty.plot_contours_scatter(path=None, show=False, s=0.05, alpha=0.5, linewidth=1,
                               c_points= 'blue',c_contours= 'red', ax=axs[1])
axs[1].set_title('total points and contours for empty')  

plt.show()

../_images/c20a1afa67a56f2e4e5dbcbf55fc4e1c0ac34759281c03fdb31bde4098ab4970.png

../_images/b6438a4ce3f8fb6d0260d1f63159b229406a9e7082fc75751e9afb3972063893.png

Once you define your parameters of contours you can transform into masks,and obtain, in this case, masks for cancer, stroma and empty areas.
The stroma mask from this analysis is the stroma minus the cancer.

To obtain masks you need to have the contours.

#### obtain masks
GM = get_masks.GetMasks(image_shape = (height, width))

mask_empty = GM.create_mask(CEmpty.contours)
mask_tum = GM.create_mask(CTum.contours)
mask_tum = GM.fill_holes(mask_tum)
mask_stroma = GM.subtract_masks(np.ones((height, width), dtype=np.uint8), mask_tum, mask_empty)          
mask_stroma = GM.filter_binary_mask_by_area(mask_stroma, min_area=700)


GM.plot_masks(masks=[mask_stroma, mask_tum], mask_names=['Stroma', 'Cancer'],
              background_color=(1, 1, 1),
              mask_colors={'Stroma': (65, 105, 225), 'Cancer': (255, 165, 0)},
              figsize = (5,5),
              path=None, show=True, ax=None)

2025-06-20 15:16:45,489 - gridgen.get_masks.GetMasks - INFO - Initialized GetMasks
2025-06-20 15:16:45,516 - gridgen.get_masks.GetMasks - INFO - Subtracted masks from base mask.

../_images/2683acbcbccbd2c4de01cdae151d28db4af2295b910c6e16dfaff58b1ccdc33f.png

From these we can expand a mask of interest and check their surroundings. In this case we will expand the cancer mask and obtain the interface border masks.

the expansion is given in pixels, so be aware, as in xenium the resolution is approx 1px = 1um so, expansions of 5,10,15,20 would be 5,10,15,20.

TA = get_masks.ConstrainedMaskExpansion(mask_tum, mask_stroma)
TA.expand_mask(expansion_pixels=[10,20], min_area=1000)

# change names and colos«rs! 
mask_colors = {'expansion_10':(0, 165, 0),
               'expansion_20':(100, 200, 0),
               'seed_mask':(255, 165, 0),
               'constraint_remaining':(65, 105, 225) } # Strom}

GM.plot_masks(masks=TA.binary_expansions.values(),
              mask_names=TA.binary_expansions.keys(),
              background_color=(1, 1, 1), mask_colors=mask_colors, path=None, show=True, ax=None,
             figsize=(8, 6))

2025-06-20 15:16:45,792 - gridgen.get_masks.GetMasks - INFO - Initialized GetMasks

../_images/5873de77a1f3edc1e255508b02d3a7416ce7c52fb7bfdfd89e53d3f6fae752a0.png

Finally we can extract the information from the masks

We have several alternatives here. We can run this analysis: * per object in each mask * per mask, where all the objects of each mask are agglomerated. * per grid. divides each object in a grid size and extracts the information based on this grid. This means the counts are based on objects with same size.

You can get:

* Counts
* Morphological characteristics 

Per mask. run analysis type = ‘bulk’

# 1. Define your masks using MaskDefinition
mask_definitions = [
    MaskDefinition(mask=TA.binary_expansions['constraint_remaining'], mask_name='Stroma_remaining', analysis_type='bulk'),
    MaskDefinition(mask=TA.binary_expansions['seed_mask'], mask_name='Cancer', analysis_type='bulk'),
    MaskDefinition(mask=TA.binary_expansions['expansion_10'], mask_name='CI_10um', analysis_type='bulk'),
    MaskDefinition(mask=TA.binary_expansions['expansion_20'], mask_name='CI_20um', analysis_type='bulk'),
]

# 2. Initialize and run the analysis pipeline
pipeline = MaskAnalysisPipeline(mask_definitions=mask_definitions, 
                                array_counts = array_total,
                        target_dict = target_dict_total,
  )
results = pipeline.run()
df = pipeline.get_results_df()
df

2025-06-20 15:16:46,665 - gridgen.mask_properties.GetMasks - INFO - Initialized MaskAnalysisPipeline with 4 masks.
2025-06-20 15:16:46,666 - gridgen.mask_properties.GetMasks - INFO - Processing mask: Stroma_remaining (bulk)
2025-06-20 15:16:47,099 - gridgen.mask_properties.GetMasks - INFO - Processing mask: Cancer (bulk)
2025-06-20 15:16:47,943 - gridgen.mask_properties.GetMasks - INFO - Processing mask: CI_10um (bulk)
2025-06-20 15:16:48,341 - gridgen.mask_properties.GetMasks - INFO - Processing mask: CI_20um (bulk)

run took 2.0489 seconds

	area	object_id	SPARC	VIM	KRT18	EPCAM	COL4A2	CEACAM1	KRT8	FCER1G	...	TNFRSF18	COL4A5	IL22RA2	IGHA2	CD45RA	IGHD	LST1	VPREB3	mask_name	analysis_type
0	313448	bulk	57272	12381	723	895	4292	151	445	1326	...	1	18	8	6	0	4	1	0	Stroma_remaining	bulk
1	1490024	bulk	58867	23990	183020	240049	7417	34765	59454	6946	...	7	31	64	13	3	11	8	5	Cancer	bulk
2	160338	bulk	28922	7273	1029	1025	3087	127	395	1330	...	1	4	9	0	1	1	1	0	CI_10um	bulk
3	118916	bulk	22367	5109	364	411	2041	66	174	812	...	1	3	7	3	0	0	2	0	CI_20um	bulk

4 rows × 484 columns

Per object in each mask, run analysis type = ‘object’

# 1. Define your masks using MaskDefinition
mask_definitions = [
    MaskDefinition(mask=TA.binary_expansions['constraint_remaining'], mask_name='Stroma_remaining', analysis_type='per_object'),
    MaskDefinition(mask=TA.binary_expansions['seed_mask'], mask_name='Cancer', analysis_type='per_object'),
    MaskDefinition(mask=TA.binary_expansions['expansion_10'], mask_name='CI_10um', analysis_type='per_object'),
    MaskDefinition(mask=TA.binary_expansions['expansion_20'], mask_name='CI_20um', analysis_type='per_object'),
]

# 2. Initialize and run the analysis pipeline
pipeline = MaskAnalysisPipeline(mask_definitions=mask_definitions, 
                                array_counts = array_total,
                        target_dict = target_dict_total,
  )
results = pipeline.run()
df = pipeline.get_results_df()
df

2025-06-20 15:16:48,730 - gridgen.mask_properties.GetMasks - INFO - Initialized MaskAnalysisPipeline with 4 masks.
2025-06-20 15:16:48,731 - gridgen.mask_properties.GetMasks - INFO - Processing mask: Stroma_remaining (per_object)
2025-06-20 15:16:49,625 - gridgen.mask_properties.GetMasks - INFO - Processing mask: Cancer (per_object)
2025-06-20 15:16:51,203 - gridgen.mask_properties.GetMasks - INFO - Processing mask: CI_10um (per_object)
2025-06-20 15:16:51,987 - gridgen.mask_properties.GetMasks - INFO - Processing mask: CI_20um (per_object)

run took 3.8931 seconds

	object_id	area	perimeter	eccentricity	solidity	centroid_y	centroid_x	min_row	min_col	max_row	...	TNFRSF18	COL4A5	IL22RA2	IGHA2	CD45RA	IGHD	LST1	VPREB3	mask_name	analysis_type
0	1	1085.0	169.018290	0.922395	0.847656	85.949309	1004.500461	75	970	102	...	0	0	0	0	0	0	0	0	Stroma_remaining	per_object
1	2	5981.0	503.972655	0.860329	0.536028	178.083264	1118.782812	115	1065	231	...	0	0	0	1	0	0	0	0	Stroma_remaining	per_object
2	3	163.0	71.805087	0.969076	0.724444	205.773006	1228.509202	197	1213	222	...	0	0	0	0	0	0	0	0	Stroma_remaining	per_object
3	4	1.0	0.000000	0.000000	1.000000	222.000000	429.000000	222	429	223	...	0	0	0	0	0	0	0	0	Stroma_remaining	per_object
4	5	2068.0	257.806133	0.890013	0.704360	290.335106	439.647002	254	407	334	...	0	0	0	0	0	0	0	0	Stroma_remaining	per_object
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
91	9	4752.0	965.637698	0.973810	0.409549	1211.740109	239.376684	1137	137	1278	...	0	0	0	0	0	0	0	0	CI_20um	per_object
92	10	26114.0	5598.663669	0.933984	0.140772	1328.682699	1136.854254	1198	606	1544	...	1	1	0	1	0	0	0	0	CI_20um	per_object
93	11	3225.0	700.607214	0.954404	0.282771	1463.885891	375.604961	1392	304	1564	...	0	0	0	0	0	0	0	0	CI_20um	per_object
94	12	14234.0	3016.399421	0.969164	0.140111	1578.050232	866.546930	1476	553	1666	...	0	1	0	0	0	0	0	0	CI_20um	per_object
95	13	2270.0	509.173665	0.822037	0.271564	1635.681938	1169.970044	1603	1119	1696	...	0	0	1	0	0	0	0	0	CI_20um	per_object

96 rows × 493 columns

Grid in each mask, run analysis type = ‘grid’. You shoudl also give a grid size omm this case

# 1. Define your masks using MaskDefinition
mask_definitions = [
    MaskDefinition(mask=TA.binary_expansions['constraint_remaining'], mask_name='Stroma_remaining', analysis_type='grid', grid_size = 10),
    MaskDefinition(mask=TA.binary_expansions['seed_mask'], mask_name='Cancer', analysis_type='grid', grid_size = 10),
    MaskDefinition(mask=TA.binary_expansions['expansion_10'], mask_name='CI_10um', analysis_type='grid', grid_size = 10),
    MaskDefinition(mask=TA.binary_expansions['expansion_20'], mask_name='CI_20um', analysis_type='grid', grid_size = 10),
]

# 2. Initialize and run the analysis pipeline
pipeline = MaskAnalysisPipeline(mask_definitions=mask_definitions, 
                                array_counts = array_total,
                        target_dict = target_dict_total,
  )
results = pipeline.run()
df = pipeline.get_results_df()
df

2025-06-20 15:16:52,653 - gridgen.mask_properties.GetMasks - INFO - Initialized MaskAnalysisPipeline with 4 masks.
2025-06-20 15:16:52,654 - gridgen.mask_properties.GetMasks - INFO - Processing mask: Stroma_remaining (grid)
2025-06-20 15:16:54,628 - gridgen.mask_properties.GetMasks - INFO - Processing mask: Cancer (grid)
2025-06-20 15:17:01,002 - gridgen.mask_properties.GetMasks - INFO - Processing mask: CI_10um (grid)
2025-06-20 15:17:02,317 - gridgen.mask_properties.GetMasks - INFO - Processing mask: CI_20um (grid)

run took 10.6772 seconds

	x	y	object_id	area	centroid_x	centroid_y	SPARC	VIM	KRT18	EPCAM	...	TNFRSF18	COL4A5	IL22RA2	IGHA2	CD45RA	IGHD	LST1	VPREB3	mask_name	analysis_type
0	1020	100	1	39	975.000000	77.282051	2	0	0	0	...	0	0	0	0	0	0	0	0	Stroma_remaining	grid
1	1020	100	1	45	984.777778	77.222222	2	0	0	0	...	0	0	0	0	0	0	0	0	Stroma_remaining	grid
2	1020	100	1	40	994.525000	77.475000	2	0	0	0	...	0	0	0	0	0	0	0	0	Stroma_remaining	grid
3	1020	100	1	49	1004.408163	77.040816	2	0	0	0	...	0	0	0	0	0	0	0	0	Stroma_remaining	grid
4	1020	100	1	33	1014.000000	77.787879	2	0	0	0	...	0	0	0	0	0	0	0	0	Stroma_remaining	grid
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
25838	1140	1690	13	70	1133.414286	1685.457143	0	0	0	0	...	0	0	0	0	0	0	0	0	CI_20um	grid
25839	1140	1690	13	8	1141.125000	1688.375000	0	0	0	0	...	0	0	0	0	0	0	0	0	CI_20um	grid
25840	1140	1690	13	3	1128.666667	1690.333333	0	0	0	0	...	0	0	0	0	0	0	0	0	CI_20um	grid
25841	1140	1690	13	43	1134.581395	1691.720930	0	0	0	0	...	0	0	0	0	0	0	0	0	CI_20um	grid
25842	1140	1690	13	16	1141.875000	1691.125000	0	0	0	0	...	0	0	0	0	0	0	0	0	CI_20um	grid

25843 rows × 488 columns

df['area'].value_counts()

area
  16321
      323
      212
      199
      196
       ...  
      65
      64
      63
      60
      60
Name: count, Length: 100, dtype: int64

TA retrieves multiple mask dicts: * binary_expansions: with binary masks * labeled_expansions: with labelled object * referenced_objects: for hierarchy analysis

We can plot the masks simply just as with simple colors. There is also a function in GM to plot labelled masks with bounding boxes.

for mask in TA.labeled_expansions:
    labeled_mask = TA.labeled_expansions[mask]
    GM.plot_labeled_masks(label_mask=labeled_mask,
                       mask_name = mask, show=False)

../_images/c54bcd54ae390d4207e19b9682fcd435124463b46b54bd44238695f20cdf40bf.png

../_images/e4895a8cc8a2ea49fc7dcc69225d21d12cacfb28f8d6738f6af01d39b6dfad8b.png

../_images/1b482eb2536db3f8b864445def1b4fca1f34261e9cc7309cba9c31572d8da48f.png

../_images/2c33d71e0c34a6cb0f1bae5a3fa21ad621737f6bccc979a3182bfe1ccb97248c.png

you can now define the parameters that best suit the full cohort to define the contours and run for all files in the cohort.

Considerations:

probably set logger to write so a file with all the logging save to a report file ( get the file to a single core or do a single file for all the cohort is up to you) save the images and plots that may be necessary for the analysis later. Plotting and save takes time, but in some cases may be saving time to prevent you to redo analysis to obtain a figure.

How the same analysis look like using SOM.#

Bin the image and aply a SOM cluster Each image is analysed separately

# Extract the filename without extension
filename = os.path.basename(file_csv).split('.')[0]

# Extract the desired part
name_file = '_'.join(filename.split('_')[:2])

We will get the bins defined in bin size from a list of df. In this case just one df of an image

These bins are not overlapping

The counts of eachbin with all the space information are saved in an adata object.

Preprocess bin applies single cell normalization oipeline to the bin information ( normalize total and log1) and filters out empty bins <10 counts.

bin_size = 5  # 10
min_counts = 10 # 10
unique_targets = df_total['target'].unique()

GB = GetBins(bin_size, unique_targets, logger)
GB.get_bin_cohort(df_list= [df_total], df_name_list = [name_file], cohort_name = 'HLA')
GB.preprocess_bin(min_counts = min_counts)
adata = GB.adata

2025-06-20 15:31:54,159 - contour_logger - INFO - Time to get bins for 1 dataframes: 1.93 seconds
2025-06-20 15:31:54,159 - contour_logger - INFO - Number of bins: 99374
2025-06-20 15:31:54,159 - contour_logger - INFO - Number of genes: 480

Now that we have the bins of the images. We can apply a SOM cluster. we will apply a SOM map with 2 cells, and see if it accurately divides tumour from stroma due to their big differences.

GC = GetContour(adata, logger)
GC.run_som(som_shape = (2,1), n_iter = 5000, sigma=.5, learning_rate=.5, random_state = 42)

2025-06-20 15:32:22,804 - contour_logger - INFO - Time to run som on 78461 bins: 0.93
2025-06-20 15:32:22,805 - contour_logger - INFO - Number of clusters: 2
2025-06-20 15:32:22,806 - contour_logger - INFO - number of bins in each cluster: cluster_som
0    42722
1    35739
Name: count, dtype: int64

As one can see this is almost instaneously. We can eval the differentially gene expression. one can use the GC.adata object and run preferred analysis. You can also use GC.eval_som_statistical

GC.eval_som_statistical(top_n=10)

2025-06-20 15:32:27,736 - contour_logger - INFO - n top genes for group 0
2025-06-20 15:32:27,738 - contour_logger - INFO - 
     names      scores  logfoldchanges  pvals  pvals_adj group
  EPCAM  496.127167        4.710810    0.0        0.0     0
  KRT18  401.328308        4.123829    0.0        0.0     0
   LCN2  350.054199        4.303699    0.0        0.0     0
   KRT8  204.175812        3.766609    0.0        0.0     0
  CCL20  203.358932        3.733404    0.0        0.0     0
    CA9  167.450836        4.739976    0.0        0.0     0
  BIRC3  165.406113        3.378658    0.0        0.0     0
  HLA-G  163.212875        1.587320    0.0        0.0     0
 IFITM1  163.142197        2.525657    0.0        0.0     0
CEACAM1  161.894699        4.590243    0.0        0.0     0
2025-06-20 15:32:27,743 - contour_logger - INFO - n top genes for group 1
2025-06-20 15:32:27,745 - contour_logger - INFO - 
     names      scores  logfoldchanges  pvals  pvals_adj group
  SPARC  367.663055        4.587291    0.0        0.0     1
 COL1A1  347.223572        4.167460    0.0        0.0     1
 IGFBP7  191.690048        4.526878    0.0        0.0     1
    LUM  190.487732        4.909051    0.0        0.0     1
  TIMP2  187.690933        4.334179    0.0        0.0     1
 COL3A1  186.767471        5.427709    0.0        0.0     1
    VIM  157.729630        2.898454    0.0        0.0     1
COL11A1  147.477905        5.134010    0.0        0.0     1
 COL6A2  132.532669        4.741213    0.0        0.0     1
  TAGLN  131.269485        4.528151    0.0        0.0     1

From the genes it appears that is separating bins based n stroma/ tumour .

Let’s see graphicaly

we can see in two different ways. one is to transform the bins into an image again using GC.get_som_2d_image(bin_size = 10) and plot it with GC.plot_som. 

The other option is to use squidpy and the centroids information stored in the adata object. In this one only the centroids are plotted ( but all the bins have the same size) 

import matplotlib.colors as mcolors
import squidpy as sq

som_images = GC.get_som_2d_image(bin_size = 5)

# Create a custom colormap and legends 
cmap = mcolors.ListedColormap(['black', 'orange', 'blue',])
legend_labels = {0: 'Empty', 1: 'Cluster 0', 2: 'Cluster 1'}

for name, som_image in som_images.items():
    print(name)
    GC.plot_som(som_image, cmap = cmap, path=None, show=True, figsize=(5, 5), ax=None, legend_labels=legend_labels)

/home/martinha/miniconda3/envs/GRIDGEN/lib/python3.11/site-packages/dask/dataframe/__init__.py:31: FutureWarning: The legacy Dask DataFrame implementation is deprecated and will be removed in a future version. Set the configuration option `dataframe.query-planning` to `True` or None to enable the new Dask Dataframe implementation and silence this warning.
  warnings.warn(
/home/martinha/miniconda3/envs/GRIDGEN/lib/python3.11/site-packages/anndata/utils.py:434: FutureWarning: Importing read_text from `anndata` is deprecated. Import anndata.io.read_text instead.
  warnings.warn(msg, FutureWarning)

TMA1_Selection22

../_images/6ff117c6924e85ed71731365e75194998d9e0734b8ca639738d11c8f8346e6c5.png

# adata.obsm["spatial"] = adata.obs[["x_centroid", "y_centroid"]].copy().to_numpy()
unique_cases =adata.obs['name'].unique()
for case in unique_cases:
    # Filter the AnnData object for the current case
    adata_case = adata[adata.obs['name'] == case, :]
    print(case)
    plt.rcParams["figure.figsize"] = (5, 4)
    sq.pl.spatial_scatter(
            adata_case,
            library_id='name',
            shape=None,
            color=["cluster_som"],
            size=1,
    )
    plt.show()

TMA1_Selection22

/home/martinha/miniconda3/envs/GRIDGEN/lib/python3.11/site-packages/scanpy/plotting/_utils.py:487: ImplicitModificationWarning: Trying to modify attribute `._uns` of view, initializing view as actual.
  adata.uns[value_to_plot + "_colors"] = colors_list
/home/martinha/miniconda3/envs/GRIDGEN/lib/python3.11/site-packages/squidpy/pl/_spatial_utils.py:976: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  _cax = scatter(

../_images/176622b5bb2bd7abde4dcc5958e214050018c0e4d4c5ea4da33d82c20c87fb1f.png

We can use this information to build masks as we saw based on the contours defined by genes.

SOM clusters are numbered. one needs to identify the identity of these clusters. from GC.eval_som_statistical(top_n=10), we can see that cluster 0 is Stroma (SPARC, COL1A1, VIM … ) and 1 is tumour (EPCAM, KRT18, KRT8 …)

name, som_image = list(som_images.items())[0]
# pass the contour area threshold
GM = get_masks.GetMasks(image_shape = (som_image.shape[0], som_image.shape[1]))
mask_S = (som_image==2).astype(np.uint8) * 255
GM.mask_S = (GM.filter_binary_mask_by_area(mask_S, min_area=700))
mask_T = (som_image==1).astype(np.uint8) * 255
GM.mask_T = (GM.filter_binary_mask_by_area(mask_T, min_area=700))

GM.plot_masks(masks=[GM.mask_S, GM.mask_T], mask_names = ['Stroma', 'Cancer'],
          background_color=(1, 1, 1), 
              mask_colors={'Stroma': (65, 105, 225), 'Cancer': (255, 165, 0)},   # {'Stroma': (0, 0, 255), 'Tumour': (255, 0, 0)
              figsize= (6,6),
              path=None, show=True, ax=None)

2025-06-20 15:34:17,418 - gridgen.get_masks.GetMasks - INFO - Initialized GetMasks

../_images/6112ce237e2c7d9c09bed18bc64009664009b2a31613a8aa653db416af9e200a.png

With the masks, we can proceed and do the remaining pipeline with expansions, resuming mask information and hierarchy analysis as described in other notebooks. Important parameters include the bin size and the number of units in SOM (see appropriate notebook).

Effect of bin size and number of units

In this pipeline there are things that can change the result. One is the bin size and the second one is the number of units in SOM .

We can see the result effect of changing the bin size below:

cmap = mcolors.ListedColormap(['black', 'blue', 'orange'])
legend_labels = {0: 'Empty', 1: 'Cluster 0', 2: 'Cluster 1'}
plot_data = []

for bin_size in [5,10,15,20,30,50,70]:
    min_counts = bin_size *3 
    unique_targets = df_total['target'].unique()
    
    GB = GetBins(bin_size, unique_targets, logger)
    GB.get_bin_cohort(df_list= [df_total], df_name_list = [name_file], cohort_name = 'HLA')
    GB.preprocess_bin(min_counts = min_counts)
    adata = GB.adata
    GC = GetContour(adata, logger)
    GC.run_som(som_shape = (2,1), n_iter = 5000, sigma=.5, learning_rate=.5, random_state = 42)
    
    som_image = GC.get_som_2d_image(bin_size)
    plot_data.append((som_image, bin_size))

2025-06-20 15:35:37,545 - contour_logger - INFO - Time to get bins for 1 dataframes: 2.13 seconds
2025-06-20 15:35:37,546 - contour_logger - INFO - Number of bins: 99374
2025-06-20 15:35:37,546 - contour_logger - INFO - Number of genes: 480
2025-06-20 15:35:38,930 - contour_logger - INFO - Time to run som on 76725 bins: 0.93
2025-06-20 15:35:38,931 - contour_logger - INFO - Number of clusters: 2
2025-06-20 15:35:38,932 - contour_logger - INFO - number of bins in each cluster: cluster_som
0    41714
1    35011
Name: count, dtype: int64
2025-06-20 15:35:43,033 - contour_logger - INFO - Time to get bins for 1 dataframes: 1.44 seconds
2025-06-20 15:35:43,034 - contour_logger - INFO - Number of bins: 28640
2025-06-20 15:35:43,034 - contour_logger - INFO - Number of genes: 480
2025-06-20 15:35:43,525 - contour_logger - INFO - Time to run som on 20234 bins: 0.36
2025-06-20 15:35:43,526 - contour_logger - INFO - Number of clusters: 2
2025-06-20 15:35:43,527 - contour_logger - INFO - number of bins in each cluster: cluster_som
0    10871
1     9363
Name: count, dtype: int64
2025-06-20 15:35:45,627 - contour_logger - INFO - Time to get bins for 1 dataframes: 1.29 seconds
2025-06-20 15:35:45,628 - contour_logger - INFO - Number of bins: 13524
2025-06-20 15:35:45,628 - contour_logger - INFO - Number of genes: 480
2025-06-20 15:35:45,930 - contour_logger - INFO - Time to run som on 9196 bins: 0.24
2025-06-20 15:35:45,931 - contour_logger - INFO - Number of clusters: 2
2025-06-20 15:35:45,932 - contour_logger - INFO - number of bins in each cluster: cluster_som
0    5444
1    3752
Name: count, dtype: int64
2025-06-20 15:35:47,621 - contour_logger - INFO - Time to get bins for 1 dataframes: 1.24 seconds
2025-06-20 15:35:47,622 - contour_logger - INFO - Number of bins: 7842
2025-06-20 15:35:47,622 - contour_logger - INFO - Number of genes: 480
2025-06-20 15:35:47,858 - contour_logger - INFO - Time to run som on 5262 bins: 0.20
2025-06-20 15:35:47,858 - contour_logger - INFO - Number of clusters: 2
2025-06-20 15:35:47,859 - contour_logger - INFO - number of bins in each cluster: cluster_som
1    3084
0    2178
Name: count, dtype: int64
2025-06-20 15:35:49,269 - contour_logger - INFO - Time to get bins for 1 dataframes: 1.11 seconds
2025-06-20 15:35:49,270 - contour_logger - INFO - Number of bins: 3527
2025-06-20 15:35:49,270 - contour_logger - INFO - Number of genes: 480
2025-06-20 15:35:49,456 - contour_logger - INFO - Time to run som on 2390 bins: 0.17
2025-06-20 15:35:49,457 - contour_logger - INFO - Number of clusters: 2
2025-06-20 15:35:49,458 - contour_logger - INFO - number of bins in each cluster: cluster_som
0    1394
1     996
Name: count, dtype: int64
2025-06-20 15:35:50,779 - contour_logger - INFO - Time to get bins for 1 dataframes: 1.10 seconds
2025-06-20 15:35:50,779 - contour_logger - INFO - Number of bins: 1302
2025-06-20 15:35:50,779 - contour_logger - INFO - Number of genes: 480
2025-06-20 15:35:50,935 - contour_logger - INFO - Time to run som on 908 bins: 0.15
2025-06-20 15:35:50,935 - contour_logger - INFO - Number of clusters: 2
2025-06-20 15:35:50,936 - contour_logger - INFO - number of bins in each cluster: cluster_som
0    456
1    452
Name: count, dtype: int64
2025-06-20 15:35:52,176 - contour_logger - INFO - Time to get bins for 1 dataframes: 1.08 seconds
2025-06-20 15:35:52,177 - contour_logger - INFO - Number of bins: 664
2025-06-20 15:35:52,177 - contour_logger - INFO - Number of genes: 480
2025-06-20 15:35:52,331 - contour_logger - INFO - Time to run som on 490 bins: 0.15
2025-06-20 15:35:52,332 - contour_logger - INFO - Number of clusters: 2
2025-06-20 15:35:52,332 - contour_logger - INFO - number of bins in each cluster: cluster_som
1    328
0    162
Name: count, dtype: int64

for som_image, bin_size in plot_data:
    for name, image in som_image.items():
        print(f"Plotting bin size: {bin_size}, SOM name: {name}")
        GC.plot_som(image, cmap=cmap, path=None, show=True, figsize=(5, 5), ax=None, legend_labels=legend_labels)

Plotting bin size: 5, SOM name: TMA1_Selection22

../_images/f6055b1f3765ce00c2a5d4e89b78c6e2e3f6773a2d1c769ed2e46b21c8280730.png

Plotting bin size: 10, SOM name: TMA1_Selection22

../_images/7cbfdbebf8319a31db92c6c786ed15a6e7d7958d8dab3cf8adde0d94f054f906.png

Plotting bin size: 15, SOM name: TMA1_Selection22

../_images/1c00b977b91db5c193e09c3de0e8492b4c1770d086b4f993439d3888d5340801.png

Plotting bin size: 20, SOM name: TMA1_Selection22

../_images/9a99b2158e93f6eed4d417ee100f4f0812796ddfec998c7a31b97036d312bf09.png

Plotting bin size: 30, SOM name: TMA1_Selection22

../_images/7a6d179612d5ece15fa4d1134765ea1918333e977649b82d5aa1d05fcd9d3583.png

Plotting bin size: 50, SOM name: TMA1_Selection22

../_images/7787c5d53f8514921786072fa4abc7ed87a5d8a94d5f014ac4a2dfb41b92584c.png

Plotting bin size: 70, SOM name: TMA1_Selection22

../_images/a75e558e20d062f980a368a3f7996fa49ebd8452a328629f9f67ddab065ca02d.png

check how the number of units and distribution on the SOM map can reflect in the clusters obtained

cmap = mcolors.ListedColormap(['black', 'blue', 'orange'])
legend_labels = {0: 'Empty', 1: 'Cluster 0', 2: 'Cluster 1'}
plot_data = []

for som_size in [(2,1), (3,1), (2,2),(3,3), (5,2)]:
    bin_size = 10
    min_counts = 20 
    unique_targets = df_total['target'].unique()
    
    GB = GetBins(bin_size, unique_targets, logger)
    GB.get_bin_cohort(df_list= [df_total], df_name_list = [name_file], cohort_name = 'HLA')
    GB.preprocess_bin(min_counts = min_counts)
    adata = GB.adata
    GC = GetContour(adata, logger)
    GC.run_som(som_shape = som_size, n_iter = 5000, sigma=.5, learning_rate=.5, random_state = 42)
    
    som_image = GC.get_som_2d_image(bin_size)
    plot_data.append((som_image, bin_size))

2025-06-20 15:36:47,109 - contour_logger - INFO - Time to get bins for 1 dataframes: 1.39 seconds
2025-06-20 15:36:47,109 - contour_logger - INFO - Number of bins: 28640
2025-06-20 15:36:47,109 - contour_logger - INFO - Number of genes: 480
2025-06-20 15:36:47,593 - contour_logger - INFO - Time to run som on 20450 bins: 0.35
2025-06-20 15:36:47,593 - contour_logger - INFO - Number of clusters: 2
2025-06-20 15:36:47,594 - contour_logger - INFO - number of bins in each cluster: cluster_som
0    11561
1     8889
Name: count, dtype: int64
2025-06-20 15:36:49,934 - contour_logger - INFO - Time to get bins for 1 dataframes: 1.52 seconds
2025-06-20 15:36:49,934 - contour_logger - INFO - Number of bins: 28640
2025-06-20 15:36:49,934 - contour_logger - INFO - Number of genes: 480
2025-06-20 15:36:50,432 - contour_logger - INFO - Time to run som on 20450 bins: 0.36
2025-06-20 15:36:50,432 - contour_logger - INFO - Number of clusters: 3
2025-06-20 15:36:50,433 - contour_logger - INFO - number of bins in each cluster: cluster_som
2    10064
0     7324
1     3062
Name: count, dtype: int64
2025-06-20 15:36:52,663 - contour_logger - INFO - Time to get bins for 1 dataframes: 1.41 seconds
2025-06-20 15:36:52,663 - contour_logger - INFO - Number of bins: 28640
2025-06-20 15:36:52,663 - contour_logger - INFO - Number of genes: 480
2025-06-20 15:36:53,195 - contour_logger - INFO - Time to run som on 20450 bins: 0.40
2025-06-20 15:36:53,195 - contour_logger - INFO - Number of clusters: 4
2025-06-20 15:36:53,196 - contour_logger - INFO - number of bins in each cluster: cluster_som
1    9425
2    5733
0    3013
3    2279
Name: count, dtype: int64
2025-06-20 15:36:55,468 - contour_logger - INFO - Time to get bins for 1 dataframes: 1.45 seconds
2025-06-20 15:36:55,469 - contour_logger - INFO - Number of bins: 28640
2025-06-20 15:36:55,469 - contour_logger - INFO - Number of genes: 480
2025-06-20 15:36:56,088 - contour_logger - INFO - Time to run som on 20450 bins: 0.49
2025-06-20 15:36:56,089 - contour_logger - INFO - Number of clusters: 9
2025-06-20 15:36:56,090 - contour_logger - INFO - number of bins in each cluster: cluster_som
6    4778
5    4740
8    3730
3    1867
4    1249
2    1225
0    1176
1    1108
7     577
Name: count, dtype: int64
2025-06-20 15:36:58,350 - contour_logger - INFO - Time to get bins for 1 dataframes: 1.44 seconds
2025-06-20 15:36:58,351 - contour_logger - INFO - Number of bins: 28640
2025-06-20 15:36:58,351 - contour_logger - INFO - Number of genes: 480
2025-06-20 15:36:58,985 - contour_logger - INFO - Time to run som on 20450 bins: 0.50
2025-06-20 15:36:58,985 - contour_logger - INFO - Number of clusters: 10
2025-06-20 15:36:58,986 - contour_logger - INFO - number of bins in each cluster: cluster_som
9    4708
0    4478
1    4189
7    1787
2    1548
6    1067
4    1032
3     612
8     516
5     513
Name: count, dtype: int64

for som_image, bin_size in plot_data:
    for name, image in som_image.items():
        # Get unique clusters from your data (e.g., via np.unique(cluster_data))
        unique_clusters = np.unique(image)
        
        # Create the legend_labels dictionary only for detected clusters
        legend_labels = {i: f'Cluster {i}' for i in unique_clusters}
        
        # Optional: set specific labels if certain clusters have special meanings
        if 0 in unique_clusters:
            legend_labels[0] = 'Empty'
        print(f"Plotting bin size: {bin_size}, SOM name: {name}")
        GC.plot_som(image, cmap=plt.cm.tab10, path=None, show=True, figsize=(5, 5), ax=None, legend_labels=legend_labels)

Plotting bin size: 10, SOM name: TMA1_Selection22

../_images/9945e20088ae9ab90110ae53949e52b2c09810c29319a22ca422e73b32cd2cef.png

Plotting bin size: 10, SOM name: TMA1_Selection22

../_images/12f8067684a0b7f4856714dfcae4917894ec569c1d7aa483adf7f843b5df7a61.png

Plotting bin size: 10, SOM name: TMA1_Selection22

../_images/180a86c063cb4bf69e40b0af54b9ef6f25ae9b77c9d5b95f72b279da3ecca8c5.png

Plotting bin size: 10, SOM name: TMA1_Selection22

../_images/41a7a4bfe620dc29e0f6889c418f20ab5c2e0c180b47be83436f2f19c0b6ee0d.png

Plotting bin size: 10, SOM name: TMA1_Selection22

../_images/61a8de032a24d5b96cdf124cc4acf048efd237e9abc25cdd54b10fdfe58d9a8b.png

General aspects of the pipeline#

Define Cancer Stroma based on density of genes#

How the same analysis look like using SOM.#

This Page