Source code for bullkpy.pl.association_rankplots

from __future__ import annotations

from pathlib import Path
from typing import Literal

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

from ._style import set_style, _savefig



[docs]
def rankplot_association(
    *,
    res: pd.DataFrame,
    gene_col: str = "gene",
    effect_col: str = "effect",      # e.g. log2FC, eta2, mean_diff
    sort_by: str = "qval",           # "qval" | "pval" | effect_col
    direction: str = "both",         # "up" | "down" | "both"
    n_items: int = 20,
    figsize: tuple[float, float] = (7, 6),
    title: str | None = None,
    save: str | Path | None = None,
    show: bool = True,
):
    """
    Ranked barplot for categorical association results.

    Behavior mirrors bk.pl.rankplot():
      • Upregulated / positive = red
      • Downregulated / negative = blue
      • Strongest up at top
      • Most downregulated LAST
    """
    set_style()

    if gene_col not in res.columns:
        raise KeyError(f"'{gene_col}' not found in result table.")
    if effect_col not in res.columns:
        raise KeyError(f"'{effect_col}' not found in result table.")
    if sort_by not in res.columns:
        raise KeyError(f"sort_by='{sort_by}' not found in result table.")

    df = res.copy()
    df = df.dropna(subset=[gene_col, effect_col, sort_by]).copy()
    df[effect_col] = pd.to_numeric(df[effect_col], errors="coerce")
    df = df.dropna(subset=[effect_col]).copy()

    n_items = int(n_items)
    if n_items <= 0:
        raise ValueError("n_items must be > 0")

    # Base ordering
    if sort_by == effect_col:
        df_sorted = df
    else:
        df_sorted = df.sort_values(sort_by, ascending=True)

    if direction == "up":
        sub = df_sorted[df_sorted[effect_col] > 0].copy()
        if sort_by == effect_col:
            sub = sub.sort_values(effect_col, ascending=False).head(n_items)
        else:
            sub = sub.head(n_items).sort_values(effect_col, ascending=False)

        # strongest up at top
        sub = sub.sort_values(effect_col, ascending=False)

    elif direction == "down":
        sub = df_sorted[df_sorted[effect_col] < 0].copy()
        if sort_by == effect_col:
            sub = sub.sort_values(effect_col, ascending=True).head(n_items)
        else:
            sub = sub.head(n_items)

        # most downregulated LAST
        sub = sub.sort_values(effect_col, ascending=False)

    elif direction == "both":
        n_up = n_items // 2
        n_down = n_items - n_up

        up = df_sorted[df_sorted[effect_col] > 0].copy()
        down = df_sorted[df_sorted[effect_col] < 0].copy()

        if sort_by == effect_col:
            up = up.sort_values(effect_col, ascending=False).head(n_up)
            down = down.sort_values(effect_col, ascending=True).head(n_down)
        else:
            up = up.head(n_up)
            down = down.head(n_down)

        # enforce final display order
        up = up.sort_values(effect_col, ascending=False)
        down = down.sort_values(effect_col, ascending=False)

        sub = pd.concat([up, down], axis=0)

    else:
        raise ValueError("direction must be 'up', 'down', or 'both'.")

    labels = sub[gene_col].astype(str).tolist()
    vals = sub[effect_col].to_numpy(dtype=float)

    colors = np.where(vals >= 0, "#D62728", "#1F77B4")  # red / blue

    fig, ax = plt.subplots(figsize=figsize, constrained_layout=True)
    ax.barh(labels, vals, color=colors)
    ax.axvline(0, linewidth=1)

    ax.set_xlabel(effect_col)
    ax.set_ylabel(gene_col)
    ax.invert_yaxis()

    ax.set_title(title or "Ranked associations")

    if save is not None:
        _savefig(fig, save)
    if show:
        plt.show()

    return fig, ax



[docs]
def dotplot_association(
    df: pd.DataFrame,
    *,
    feature_col: str,            # "gene" or "obs"
    groupby_col: str = "groupby",
    effect_col: str = "effect",
    q_col: str = "qval",
    top_n: int = 50,
    figsize: tuple[float, float] | None = None,
    cmap: str = "RdBu_r",
    vmin: float | None = None,
    vmax: float | None = None,
    size_min: float = 10.0,
    size_max: float = 250.0,
    title: str | None = None,
    save: str | Path | None = None,
    show: bool = True,
):
    """
    Scanpy-like dotplot for association results across multiple contrasts/groupby runs.
      - dot color = effect
      - dot size  = -log10(qval)
    Works when df contains multiple groupby/contrasts (groupby_col), otherwise still works.
    """
    set_style()

    d = df.copy()
    d = d.replace([np.inf, -np.inf], np.nan).dropna(subset=[feature_col, groupby_col, effect_col, q_col])

    # take top_n per groupby_col
    d = d.sort_values([groupby_col, q_col], ascending=[True, True])
    d = d.groupby(groupby_col, group_keys=False).head(int(top_n))

    piv_eff = d.pivot_table(index=feature_col, columns=groupby_col, values=effect_col, aggfunc="first")
    piv_q = d.pivot_table(index=feature_col, columns=groupby_col, values=q_col, aggfunc="first")

    eff = piv_eff.to_numpy(dtype=float)
    qq = piv_q.to_numpy(dtype=float)
    siz = -np.log10(np.maximum(qq, 1e-300))

    # scale size
    smin, smax = np.nanmin(siz), np.nanmax(siz)
    if not np.isfinite(smin) or not np.isfinite(smax) or smax == smin:
        u = np.zeros_like(siz)
    else:
        u = (siz - smin) / (smax - smin)
    sizes = size_min + (size_max - size_min) * u

    if figsize is None:
        figsize = (max(6, 0.45 * piv_eff.shape[1] + 2.8), max(4, 0.22 * piv_eff.shape[0] + 1.6))

    fig, ax = plt.subplots(figsize=figsize, constrained_layout=True)

    if vmin is None:
        vmin = float(np.nanmin(eff))
    if vmax is None:
        vmax = float(np.nanmax(eff))
    norm = mpl.colors.TwoSlopeNorm(vmin=vmin, vcenter=0.0, vmax=vmax) if (vmin < 0 < vmax) else mpl.colors.Normalize(vmin=vmin, vmax=vmax)
    cm = mpl.cm.get_cmap(cmap)

    xs = np.arange(piv_eff.shape[1])
    ys = np.arange(piv_eff.shape[0])

    for i in range(piv_eff.shape[0]):
        ax.scatter(
            xs,
            np.full_like(xs, ys[i]),
            s=sizes[i, :],
            c=cm(norm(eff[i, :])),
            edgecolors="0.2",
            linewidths=0.3,
        )

    ax.set_xticks(xs)
    ax.set_xticklabels([str(c) for c in piv_eff.columns], rotation=90)
    ax.set_yticks(ys)
    ax.set_yticklabels([str(r) for r in piv_eff.index])
    ax.invert_yaxis()

    sm = mpl.cm.ScalarMappable(norm=norm, cmap=cm)
    cbar = fig.colorbar(sm, ax=ax, pad=0.01)
    cbar.set_label(effect_col)

    ax.set_title(title or f"Association dotplot: color={effect_col}, size=-log10({q_col})")

    if save is not None:
        _savefig(fig, save)
    if show:
        plt.show()
    return fig, ax




[docs]
def heatmap_association(
    df: pd.DataFrame,
    *,
    feature_col: str,
    groupby_col: str = "groupby",
    value_col: str = "effect",
    top_n: int = 60,
    cmap: str = "RdBu_r",
    center: float = 0.0,
    figsize: tuple[float, float] | None = None,
    title: str | None = None,
    save: str | Path | None = None,
    show: bool = True,
):
    """
    Heatmap of association values (effect by default), selecting top_n rows by best qval per column.
    """
    set_style()

    d = df.copy()
    if "qval" in d.columns:
        d = d.sort_values(["qval"], ascending=True)
    d = d.replace([np.inf, -np.inf], np.nan).dropna(subset=[feature_col, groupby_col, value_col])

    # pick top_n unique features overall (simple + robust)
    feats = d[feature_col].astype(str).drop_duplicates().head(int(top_n)).tolist()
    d = d[d[feature_col].astype(str).isin(feats)]

    piv = d.pivot_table(index=feature_col, columns=groupby_col, values=value_col, aggfunc="first")
    mat = piv.to_numpy(dtype=float)

    if figsize is None:
        figsize = (max(6, 0.45 * piv.shape[1] + 3), max(4, 0.22 * piv.shape[0] + 1.8))

    fig, ax = plt.subplots(figsize=figsize, constrained_layout=True)
    im = ax.imshow(mat, aspect="auto", cmap=cmap)
    ax.set_xticks(np.arange(piv.shape[1]))
    ax.set_xticklabels([str(c) for c in piv.columns], rotation=90)
    ax.set_yticks(np.arange(piv.shape[0]))
    ax.set_yticklabels([str(r) for r in piv.index])
    ax.invert_yaxis()

    cbar = fig.colorbar(im, ax=ax, pad=0.01)
    cbar.set_label(value_col)

    ax.set_title(title or f"Association heatmap ({value_col})")

    if save is not None:
        _savefig(fig, save)
    if show:
        plt.show()
    return fig, ax