Source code for treefit.fit

# Copyright (C) 2020  Momoko Hayamizu <hayamizu@ism.ac.jp>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program.  If not, see
# <http://www.gnu.org/licenses/>.

import math
import matplotlib.pyplot as plt
import matplotlib.ticker
import numpy as np
import pandas as pd
import scipy.linalg
import scipy.sparse.csgraph
import scipy.spatial.distance
import sklearn.decomposition

def perturbate_poisson(counts, strength=1.0):
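    # Resample each entry from a Poisson distribution whose mean is the
    # observed count, then scale by ``strength`` and cast back to the
    # original dtype. This mimics the sampling noise of count data such
    # as gene expression counts.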
    return (np.random.poisson(counts) * strength).astype(counts.dtype)

def calculate_distance_matrix(expression):
    # Use 'euclidean' method for now.
    # TODO: make method parameter
    return scipy.spatial.distance.cdist(expression, expression, 'euclidean')

def calculate_mst(expression):
    distance_matrix = calculate_distance_matrix(expression)
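    # ``minimum_spanning_tree`` returns a sparse matrix in which only the
    # MST edges are non-zero (holding their Euclidean edge lengths); the
    # weights are dropped and the matrix symmetrized below to obtain an
    # unweighted adjacency matrix.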
    mst = scipy.sparse.csgraph.minimum_spanning_tree(distance_matrix)
    # Remove weights
    mst[mst > 0] = 1
    return mst + mst.T

def perturbate_knn(expression, strength=1.0):
    n_samples, n_features = expression.shape
    # TODO: Improve
    k_nearest_neighbors = round(n_samples * 0.0125)
    if k_nearest_neighbors < 2:
        k_nearest_neighbors = 2
    standard_deviation = strength / np.sqrt(k_nearest_neighbors)
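    # Scaling the per-neighbor noise by 1/sqrt(k) keeps the summed
    # perturbation on the order of ``strength`` regardless of how many
    # neighbors are used.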
    distance_matrix = calculate_distance_matrix(expression)
    perturbated_expression = np.zeros((n_samples, n_features))
    for i in range(n_samples):
        sorted_indices = np.argsort(distance_matrix[i, :])
        nearest_neighbors = \
            expression[sorted_indices[1:(k_nearest_neighbors + 1)]]
        diffs = nearest_neighbors - expression[i]
        # One Gaussian weight per neighbor: the perturbation is a random
        # combination of the difference vectors toward the k nearest
        # neighbors.
        weights = np.random.normal(scale=standard_deviation,
                                   size=(k_nearest_neighbors, 1))
        weighted_diffs = diffs * weights
        perturbated_expression[i] = expression[i] + np.sum(weighted_diffs, axis=0)
    return perturbated_expression

def calculate_low_dimension_laplacian_eigenvectors(mst, p):
    laplacian = scipy.sparse.csgraph.laplacian(mst)
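    # The eigenvectors of the graph Laplacian (L = D - A) with the
    # smallest non-zero eigenvalues capture the coarse structure of the
    # tree; eigenvalues that are numerically zero correspond to connected
    # components and are dropped below.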
    eigenvalues, eigenvectors = np.linalg.eigh(laplacian.toarray())
    while len(eigenvalues) > 0 and math.isclose(eigenvalues[0], 0, abs_tol=1e-9):
        eigenvalues = eigenvalues[1:]
        eigenvectors = eigenvectors[:, 1:]
    low_dimension_values = eigenvalues[0:p]
    low_dimension_vectors = eigenvectors[:, 0:p]
    if len(np.unique(low_dimension_values)) != len(low_dimension_values):
        low_dimension_vectors = scipy.linalg.orth(low_dimension_vectors)
    low_dimension_vectors /= np.linalg.norm(low_dimension_vectors, axis=0)
    return low_dimension_vectors

def calculate_canonical_correlation(u, v):
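    # The singular values of u^T v are the cosines of the principal
    # angles between the subspaces spanned by the columns of u and v
    # (the columns are orthonormal by construction above).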
    uTv = np.matmul(u.T, v)
    return scipy.linalg.svd(uTv)[1]

def calculate_grassmann_distance_max_cca(canonical_correlation):
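    # d_max = sqrt(1 - max_i cos^2(theta_i)), i.e. the sine of the
    # smallest principal angle between the two subspaces.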
    max_cos_theta = np.max(canonical_correlation)
    return np.sqrt(np.max([0, 1 - max_cos_theta ** 2]))

def calculate_grassmann_distance_rms_cca(canonical_correlation):
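    # d_rms = sqrt(mean_i(1 - cos^2(theta_i))), the root mean square of
    # the sines of the principal angles.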
    return np.sqrt(np.mean(1 - canonical_correlation ** 2))

def calculate_eigenvectors_list(original,
                                perturbations,
                                normalize,
                                reduce_dimension,
                                build_tree,
                                max_p,
                                n_perturbations):
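    # Turn the original data and its perturbed copies into trees and
    # return, for each of them, the low-dimensional Laplacian
    # eigenvectors that treefit() compares against each other.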
    poisson_strength = 1.0
    # TODO: Improve
    knn_strength = 0.2 * (500 / 200) ** 0.5
    targets = []

    if perturbations is None or 'poisson' in perturbations:
        counts = original.get('counts')
        if counts is not None:
            targets.append(counts)
            for i in range(n_perturbations):
                perturbated_counts = perturbate_poisson(counts, poisson_strength)
                targets.append(perturbated_counts)
        elif perturbations is not None:
            raise TypeError('no count data: %s' % original)

        if len(targets) > 0:
            # TODO: Normalize
            pass

    if perturbations is None or 'knn' in perturbations:
        if len(targets) == 0:
            expression = original.get('expression')
            if expression is None:
                raise TypeError('no expression data: %s' % original)

            targets.append(expression)
            for i in range(n_perturbations):
                perturbated_expression = perturbate_knn(expression, knn_strength)
                targets.append(perturbated_expression)
        else:
            targets = \
                [perturbate_knn(target, knn_strength) for target in targets]

    def calculate(target):
        if reduce_dimension is None or isinstance(reduce_dimension, int):
            if isinstance(reduce_dimension, int):
                n_dimensions = reduce_dimension
            else:
                n_dimensions = None
            pca = sklearn.decomposition.PCA(n_dimensions)
            target = pca.fit_transform(target)
        elif reduce_dimension:
            target = reduce_dimension(target)
        # Honor a user-supplied tree builder; fall back to the MST.
        if build_tree is None:
            tree = calculate_mst(target)
        else:
            tree = build_tree(target)
        return calculate_low_dimension_laplacian_eigenvectors(tree, max_p)

    return list(map(calculate, targets))

class Fit:
    """The estimated result of :py:func:`treefit.treefit`.

    Attributes
    ----------
    max_cca_distance: pandas.DataFrame
        The result of max canonical correlation analysis distance.
        It has the following columns:

        * ``p``: Dimensionality of the feature space of tree structures.

        * ``mean``: The mean of the target distance values.

        * ``standard_deviation``: The standard deviation of the target
          distance values.

    rms_cca_distance: pandas.DataFrame
        The result of root mean square canonical correlation analysis
        distance. This has the same columns as ``max_cca_distance``.

    n_principal_paths_candidates: [int]
        The candidates of the number of principal paths.
    """

    def __init__(self,
                 name,
                 max_cca_distance,
                 rms_cca_distance,
                 n_principal_paths_candidates):
        self.name = name
        self.max_cca_distance = max_cca_distance
        self.rms_cca_distance = rms_cca_distance
        self.n_principal_paths_candidates = n_principal_paths_candidates

    def __str__(self):
        class_name = f'{self.__class__.__module__}.{self.__class__.__qualname__}'
        return f"""{class_name}: {self.name}
max_cca_distance:
{self.max_cca_distance}
rms_cca_distance:
{self.rms_cca_distance}
n_principal_paths_candidates: {self.n_principal_paths_candidates}"""

def treefit(target,
            name=None,
            perturbations=None,
            normalize=None,
            reduce_dimension=None,
            build_tree=None,
            max_p=20,
            verbose=False,
            n_perturbations=20):
    """Estimate the goodness-of-fit between tree models and data.

    Parameters
    ----------
    target : dict
        The target data to be estimated. It must be one of the following:

        * ``{"counts": COUNTS}``

        * ``{"expression": EXPRESSION}``

        ``COUNTS`` and ``EXPRESSION`` are ``numpy.array``. The rows and
        columns correspond to samples such as cells and features such
        as genes. ``COUNTS``'s value is count data such as the number
        of genes expressed. ``EXPRESSION``'s value is normalized count
        data.

    name : string
        The name of the target as a string.

    perturbations : list
        How to perturb the target data. If this is ``None``, all
        available perturbation methods are used. You can specify the
        perturbation methods to use as a ``list``. Here are the
        available methods:

        * ``"poisson"``: A perturbation method for counts data.

        * ``"knn"``: A perturbation method for expression data.

    normalize : callable
        How to normalize counts data. If this is ``None``, the default
        normalization is applied. You can specify a ``callable`` object
        that normalizes counts data.

    reduce_dimension : callable
        How to reduce the dimensionality of normalized counts data. If
        this is ``None``, the default dimensionality reduction is
        applied. If this is an ``int``, PCA with that number of
        components is used. You can also specify a ``callable`` object
        that reduces the dimensionality of normalized counts data.

    build_tree : callable
        How to build a tree of expression data. If this is ``None``, an
        MST is built. You can specify a function that builds a tree
        from normalized counts data.

    max_p : int
        How many low-dimensional Laplacian eigenvectors are used. The
        default is ``20``.

    n_perturbations : int
        How many times to perturb. The default is ``20``.

    Returns
    -------
    fit : treefit.fit.Fit
        An estimated result as a :py:class:`treefit.fit.Fit` object.

    Examples
    --------
    >>> import treefit
    # Generate star tree data that have normalized expression values,
    # not count data.
    >>> star = treefit.data.generate_2d_n_arms_star_data(300, 3, 0.1)
    # Estimate the tree-likeness of the tree data.
    >>> fit = treefit.treefit({"expression": star})
    """
    if name is None:
        name = "fit"
    eigenvectors_list = calculate_eigenvectors_list(target,
                                                    perturbations,
                                                    normalize,
                                                    reduce_dimension,
                                                    build_tree,
                                                    max_p,
                                                    n_perturbations)
    ps = []
    max_cca_distance_means = []
    max_cca_distance_standard_deviations = []
    rms_cca_distance_means = []
    rms_cca_distance_standard_deviations = []
    # For each dimensionality p, compare the eigenvector subspace of the
    # original data against the subspace of each perturbed copy.
    for p in range(1, max_p + 1):
        ps.append(p)
        max_cca_distance_values = []
        rms_cca_distance_values = []
        for i in range(1, len(eigenvectors_list)):
            u = eigenvectors_list[0][:, 0:p]
            v = eigenvectors_list[i][:, 0:p]
            canonical_correlation = calculate_canonical_correlation(u, v)
            max_cca_distance_values.append(
                calculate_grassmann_distance_max_cca(canonical_correlation))
            rms_cca_distance_values.append(
                calculate_grassmann_distance_rms_cca(canonical_correlation))
        max_cca_distance_means.append(np.mean(max_cca_distance_values))
        max_cca_distance_standard_deviations.append(
            np.std(max_cca_distance_values))
        rms_cca_distance_means.append(np.mean(rms_cca_distance_values))
        rms_cca_distance_standard_deviations.append(
            np.std(rms_cca_distance_values))
    # A local minimum of the mean RMS distance at dimensionality p
    # suggests p + 1 as a candidate for the number of principal paths.
    n_principal_paths_candidates = []
    for p in range(1, max_p - 1):
        if p == 1:
            rms_cca_distance_mean_before = float("inf")
        else:
            rms_cca_distance_mean_before = rms_cca_distance_means[p - 2]
        rms_cca_distance_mean = rms_cca_distance_means[p - 1]
        rms_cca_distance_mean_after = rms_cca_distance_means[p]
        if rms_cca_distance_mean_before > rms_cca_distance_mean and \
           rms_cca_distance_mean < rms_cca_distance_mean_after:
            n_principal_paths_candidates.append(p + 1)
    max_cca_distance = pd.DataFrame({
        'p': ps,
        'mean': max_cca_distance_means,
        'standard_deviation': max_cca_distance_standard_deviations,
    })
    rms_cca_distance = pd.DataFrame({
        'p': ps,
        'mean': rms_cca_distance_means,
        'standard_deviation': rms_cca_distance_standard_deviations,
    })
    return Fit(name,
               max_cca_distance,
               rms_cca_distance,
               n_principal_paths_candidates)

def plot(*fits):
    """Plot estimated results to get insight.

    Parameters
    ----------
    *fits : [treefit.fit.Fit]
        The estimated results by treefit.treefit() to be visualized.

    Examples
    --------
    >>> import treefit
    # Generate tree data.
    >>> tree = treefit.data.generate_2d_n_arms_star_data(200, 3, 0.1)
    # Estimate the goodness-of-fit between tree models and the tree data.
    >>> fit = treefit.treefit({"expression": tree}, "tree")
    # Visualize the estimated result.
    >>> treefit.plot(fit)
    # You can compare multiple estimated results by giving each fit a name.
    >>> tree2 = treefit.data.generate_2d_n_arms_star_data(200, 3, 0.9)
    >>> fit2 = treefit.treefit({"expression": tree2}, "tree2")
    >>> treefit.plot(fit, fit2)
    """
    fig, axes = plt.subplots(1, 2, figsize=(10, 6))
    max_ax = axes[0]
    rms_ax = axes[1]

    def plot_data_frame(ax, title, value_label, data_frame):
        p = data_frame['p']
        mean = data_frame['mean']
        standard_deviation = data_frame['standard_deviation']
        ax.set_title(title)
        ax.set_xlabel('p: Dimensionality of the feature space of trees')
        ax.set_ylabel('%s (mean and SD)' % value_label)
        ax.plot(p, mean)
        ax.fill_between(p,
                        mean - standard_deviation,
                        mean + standard_deviation,
                        alpha=0.2,
                        zorder=-10)
        ax.xaxis.set_major_locator(matplotlib.ticker.MultipleLocator(1))

    for fit in fits:
        plot_data_frame(max_ax,
                        'Analysis of the structural instability\n' +
                        'of the estimated trees',
                        'max_cca_distance',
                        fit.max_cca_distance)
        plot_data_frame(rms_ax,
                        'Prediction for\nthe number of principal paths',
                        'rms_cca_distance',
                        fit.rms_cca_distance)
    if len(fits) > 1:
        legend = [fit.name for fit in fits]
        max_ax.legend(legend)
        rms_ax.legend(legend)
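
if __name__ == '__main__':
    # Illustrative sketch only (not part of the upstream treefit API):
    # build a small, noisy 3-arm star in 2-D with plain NumPy as a
    # stand-in for treefit.data.generate_2d_n_arms_star_data, estimate
    # its tree-likeness with treefit(), and show the diagnostic plots.
    rng = np.random.default_rng(0)
    n_arms = 3
    n_samples_per_arm = 100
    arms = []
    for arm in range(n_arms):
        angle = 2 * np.pi * arm / n_arms
        direction = np.array([np.cos(angle), np.sin(angle)])
        positions = np.linspace(0, 1, n_samples_per_arm)[:, np.newaxis] * direction
        arms.append(positions + rng.normal(scale=0.1, size=positions.shape))
    star = np.concatenate(arms)
    fit = treefit({'expression': star}, name='star')
    print(fit)
    print('Candidate numbers of principal paths:',
          fit.n_principal_paths_candidates)
    plot(fit)
    plt.show()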