Source code for bullkpy.pp.hvg

from __future__ import annotations

import numpy as np
import pandas as pd
import scipy.sparse as sp
import anndata as ad

from ..logging import info, warn


[docs] def highly_variable_genes( adata: ad.AnnData, *, layer: str | None = "log1p_cpm", n_top_genes: int = 2000, n_bins: int = 20, min_mean: float = 0.0, max_mean: float = np.inf, min_disp: float = -np.inf, key_added: str = "highly_variable", ) -> None: """ Select highly variable genes (bulk-friendly version of HVGs). This computes mean and variance across samples on the chosen layer and identifies genes with high dispersion relative to genes of similar mean (mean-binned z-score of log-dispersion, Scanpy/Seurat spirit). Stores results in `adata.var`: - mean, variance, dispersion, dispersion_norm - boolean flag `adata.var[key_added]` (default: 'highly_variable') """ X = adata.layers[layer] if layer is not None else adata.X info(f"Computing HVGs from layer={layer!r} with n_top_genes={n_top_genes}") if sp.issparse(X): X = X.tocsr() mean = np.asarray(X.mean(axis=0)).ravel() mean_sq = np.asarray(X.power(2).mean(axis=0)).ravel() var = mean_sq - mean**2 var[var < 0] = 0.0 # numerical safety else: X = np.asarray(X, dtype=float) mean = X.mean(axis=0) var = X.var(axis=0, ddof=0) # Dispersion (variance / mean) in log space (stable) with np.errstate(divide="ignore", invalid="ignore"): disp = var / mean disp[~np.isfinite(disp)] = 0.0 # Filter by mean/disp thresholds (optional) valid = ( (mean >= min_mean) & (mean <= max_mean) & (disp >= min_disp) ) if valid.sum() == 0: raise ValueError("No genes pass mean/disp filters; relax thresholds.") # Bin genes by mean (on log scale to spread bins better) mean_for_bins = np.log1p(mean) bins = pd.qcut(mean_for_bins[valid], q=min(n_bins, int(valid.sum())), duplicates="drop") disp_log = np.log1p(disp) # log(1+disp) disp_norm = np.full_like(disp_log, fill_value=np.nan, dtype=float) # Z-score within mean bins valid_idx = np.where(valid)[0] for b in bins.categories: mask = np.zeros_like(valid, dtype=bool) # bins is only for valid genes; map back to global indices in_bin = np.asarray(bins == b) mask[valid_idx[in_bin]] = True vals = disp_log[mask] if vals.size < 2: continue mu = np.mean(vals) sd = np.std(vals, ddof=0) if sd == 0: sd = 1.0 disp_norm[mask] = (vals - mu) / sd # Rank genes by normalized dispersion (higher = more variable) # Fill NaNs with -inf so they never get selected score = np.nan_to_num(disp_norm, nan=-np.inf) n_top = int(min(n_top_genes, score.size)) top_idx = np.argsort(score)[::-1][:n_top] hvg = np.zeros(score.size, dtype=bool) hvg[top_idx] = True # Store in adata.var adata.var["means"] = mean adata.var["variances"] = var adata.var["dispersions"] = disp adata.var["dispersions_norm"] = disp_norm adata.var[key_added] = hvg info(f"Marked {hvg.sum()} highly variable genes in adata.var['{key_added}']")