"""Module for Gaussian Method to rank neurons

This module implements the Gaussian Method to rank neurons by importance.

.. seealso::
    Lucas Torroba Hennigen, Adina Williams, and Ryan Cotterell. Intrinsic probing through dimension
    selection. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language
    Processing (EMNLP), pp. 197–216, Online, 2020. Association for Computational Linguistics. doi:
    10.18653/v1/2020.emnlp-main.15.
"""
import numpy as np
import torch
import torch.distributions.multivariate_normal as mn
from . import metrics
"""
Modified from original version at
https://github.com/technion-cs-nlp/Individual-Neurons-Pitfalls/blob/main/Gaussian.py
"""
class GaussianProbe:
    """Generative probe that models each class with a multivariate Gaussian.

    On construction the probe groups the training features by label and
    computes, per label, a mean vector (``mu_star``) and a regularised
    covariance matrix (``sigma_star``). For any subset of feature columns,
    the corresponding sub-Gaussians can then be built and used to score
    examples without re-estimating anything.

    Attributes
    ----------
    device : torch.device
        ``cuda`` when available, otherwise ``cpu``.
    train_features, train_labels : torch.Tensor
        Training data moved to ``device``; labels are cast to ``long``.
    labels_dim : int
        Number of distinct values found in the training labels.
    feature_sets, label_sets, categorical_sets : dict
        Per-split (``"train"``, ``"test"``, ...) features, labels and
        empirical label frequencies.
    mu_star : torch.Tensor
        Per-label mean, shape ``[labels_dim, feature_dim]``.
    sigma_star : torch.Tensor
        Per-label covariance, shape ``[labels_dim, feature_dim, feature_dim]``.
    """

    def __init__(self, X, y):
        """Store the training data and estimate the per-label Gaussians.

        Parameters
        ----------
        X : array-like
            Feature matrix, one row per example and one column per neuron.
            Converted with ``torch.tensor``.
        y : array-like
            One integer label per row of ``X``. Labels are iterated as
            ``range(labels_dim)``, so they are expected to be ``0..K-1``.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.train_features = torch.tensor(X).to(self.device)
        self.train_labels = torch.tensor(y).to(self.device).long()
        self.labels_dim = len(set(self.train_labels.tolist()))
        self._get_mean_and_cov()
        self.feature_sets = {"train": self.train_features}
        self.label_sets = {"train": self.train_labels}
        self.categorical_sets = {}
        self.train_categorical = self._get_categorical("train")
        self.categorical_sets["train"] = self.train_categorical

    def _get_categorical(self, set_name):
        """Return the empirical label frequencies of the split ``set_name``.

        The result always has ``labels_dim`` entries, entry ``i`` being the
        fraction of examples labelled ``i``. ``bincount`` is used instead of
        ``histc`` because ``histc`` without explicit bounds spreads its bins
        over the *observed* label range, which misplaces the counts whenever
        a split does not contain every label.
        """
        labels = self.label_sets[set_name]
        # Labels >= labels_dim (unseen in training) are dropped by the slice.
        counts = torch.bincount(labels, minlength=self.labels_dim)[: self.labels_dim]
        return (counts / labels.size()[0]).to(self.device)

    def _get_mean_and_cov(self):
        """Estimate ``mu_star`` and ``sigma_star`` from the training data.

        The variable names (``mu_0``, ``lambda_0``, ``k_0``, ``v_0`` and their
        ``_n`` counterparts) mirror a conjugate prior/posterior update for a
        Gaussian with unknown mean and covariance; see the paper referenced in
        the module docstring for the derivation.
        """
        feature_dim = self.train_features.shape[1]

        # One [n_label, feature_dim] matrix per label.
        self.features_by_label = [
            self.train_features[self.train_labels == label]
            for label in range(self.labels_dim)
        ]
        empirical_means = torch.stack(
            [features.mean(dim=0) for features in self.features_by_label]
        )
        # np.cov returns a 0-d array for a single feature column; atleast_2d
        # keeps the diagonal extraction below valid in that case.
        empirical_covs = [
            torch.tensor(np.atleast_2d(np.cov(features.cpu().numpy(), rowvar=False)))
            for features in self.features_by_label
        ]

        # Prior parameters.
        mu_0 = empirical_means  # [label_dim, feature_dim]
        lambda_0 = torch.stack(
            [torch.diag(torch.diagonal(cov)) for cov in empirical_covs]
        ).to(self.device)
        v_0 = torch.tensor(feature_dim + 2).to(self.device)
        k_0 = torch.tensor(0.01).to(self.device)

        # Posterior parameters.
        N_v = torch.tensor(
            [features.shape[0] for features in self.features_by_label]
        ).to(self.device)  # [label_dim]
        k_n = k_0 + N_v  # [label_dim]
        v_n = v_0 + N_v  # [label_dim]
        mu_n = (k_0 * mu_0 + N_v.unsqueeze(1) * empirical_means) / k_n.unsqueeze(
            1
        )  # [label_dim, feature_dim]

        # Per-label scatter matrices around the empirical mean.
        scatter = []
        for features, mean in zip(self.features_by_label, empirical_means):
            centered = features - mean
            scatter.append(centered.T @ centered)
        S = torch.stack(scatter).to(self.device)
        lambda_n = lambda_0 + S

        self.mu_star = mu_n
        sigma_star = lambda_n / (v_n + feature_dim + 2).view(self.labels_dim, 1, 1)

        # Each matrix is symmetric (diagonal + X^T X), so the symmetric
        # eigenvalue solver applies. torch.eig, used previously, has been
        # removed from PyTorch.
        min_eig = torch.linalg.eigvalsh(sigma_star).amin(dim=-1)
        negative = min_eig < 0
        if negative.any():
            # Shift the spectrum up by 10x the most negative eigenvalue so the
            # matrix can be used as a covariance.
            identity = torch.eye(
                feature_dim, dtype=sigma_star.dtype, device=sigma_star.device
            )
            sigma_star[negative] -= min_eig[negative].view(-1, 1, 1) * identity * 10
        self.sigma_star = sigma_star

    def _get_distributions(self, selected_features):
        """Build one Gaussian per label restricted to ``selected_features``.

        The result is stored in ``self.distributions``. Labels whose training
        frequency is zero get ``None``: ``_compute_probs`` never queries them,
        and a degenerate ``Normal(0.0, 0.0)`` placeholder is rejected by
        PyTorch's default argument validation.
        """
        self.distributions = []
        for label in range(self.labels_dim):
            if self.train_categorical[label].item() == 0:
                self.distributions.append(None)
                continue
            mean = self.mu_star[label, selected_features].double()
            cov = self.sigma_star[label, selected_features][:, selected_features]
            self.distributions.append(mn.MultivariateNormal(mean, cov.double()))

    def _compute_probs(self, selected_features, set_name):
        """Compute log-posteriors over labels for every example in a split.

        Requires ``_get_distributions`` to have been called with the same
        ``selected_features``. Sets:

        * ``self.not_normalized_probs`` -- log-likelihood + log-prior,
        * ``self.normalizer`` -- log-sum-exp of the above over labels,
        * ``self.probs`` -- normalised log-posteriors, ``[n_examples, labels_dim]``.
        """
        features = self.feature_sets[set_name]
        with torch.no_grad():
            log_probs = []
            for label in range(self.labels_dim):
                if self.train_categorical[label] == 0:
                    # Value is irrelevant: the log-prior added below is -inf.
                    # Allocate on the data's device so torch.stack works on GPU.
                    log_probs.append(
                        torch.zeros(
                            features.shape[0],
                            dtype=torch.double,
                            device=features.device,
                        )
                    )
                else:
                    log_probs.append(
                        self.distributions[label].log_prob(
                            features[:, selected_features]
                        )
                    )
            log_probs = torch.stack(log_probs, dim=1)
            log_prob_times_cat = log_probs + self.train_categorical.log()
            self.not_normalized_probs = log_prob_times_cat
            self.normalizer = log_prob_times_cat.logsumexp(dim=1)
            self.probs = log_prob_times_cat - self.normalizer.unsqueeze(1)

    def _predict(self, set_name: str):
        """Score the log-posteriors last produced by ``_compute_probs``.

        Returns
        -------
        preds : torch.Tensor
            Arg-max label per example.
        labels : torch.Tensor
            Gold labels of the split ``set_name``.
        accuracy : float
            Fraction of examples where ``preds == labels``.
        mutual_inf : float
            (label entropy - mean negative log-posterior of the gold label),
            divided by ``ln 2``.
        normalized_mutual_inf : float
            ``mutual_inf`` divided by the label entropy.
        """
        preds = self.probs.argmax(dim=1)
        labels = self.label_sets[set_name]
        accuracy = (preds == labels).sum().item() / labels.shape[0]

        entropy = torch.distributions.Categorical(
            self.categorical_sets[set_name]
        ).entropy()
        with torch.no_grad():
            rows = torch.arange(self.probs.shape[0], device=self.probs.device)
            conditional_entropy = -self.probs[rows, labels].mean()
            mutual_inf = (entropy - conditional_entropy) / torch.tensor(2.0).log()
        # NOTE(review): mutual_inf is in bits while entropy is in nats, so this
        # ratio can exceed 1 -- kept as in the reference code; confirm intent.
        return preds, labels, accuracy, mutual_inf.item(), (mutual_inf / entropy).item()
def train_probe(X, y):
    """
    Train a Gaussian probe.

    This builds a ``GaussianProbe`` from the given data. All fitting happens
    in the probe's constructor, which estimates per-class statistics from
    ``X`` and ``y``; there is no iterative optimisation and there are no
    hyperparameters to pass.

    Parameters
    ----------
    X : numpy.ndarray
        Numpy Matrix of size [``NUM_TOKENS`` x ``NUM_NEURONS``]. Usually the
        output of ``interpretation.utils.create_tensors``.
    y : numpy.ndarray
        Numpy Vector of size [``NUM_TOKENS``] with 0-indexed integer class
        labels for each input token. Assumes that class labels are
        continuous from ``0`` to ``NUM_CLASSES-1``.

    Returns
    -------
    probe : GaussianProbe
        Probe fitted on the given data.

    """
    return GaussianProbe(X, y)
def evaluate_probe(
    probe,
    X_test,
    y_test,
    metric="accuracy",
    return_predictions=False,
    selected_neurons=None,
):
    """
    Evaluates a trained Gaussian probe.

    The test data is registered on ``probe`` under the ``"test"`` split,
    the per-class Gaussians are rebuilt for ``selected_neurons``, and the
    resulting predictions are scored with ``metrics.compute_score``.

    Note that this mutates ``probe``: it sets ``test_features``,
    ``test_labels``, ``test_categorical`` and the ``"test"`` entries of
    ``feature_sets``, ``label_sets`` and ``categorical_sets``, and replaces
    the probe's current distributions and probabilities.

    Parameters
    ----------
    probe : GaussianProbe
        Trained probe model
    X_test : numpy.ndarray
        Numpy Matrix of size [``NUM_TOKENS`` x ``NUM_NEURONS``]. Usually the
        output of ``interpretation.utils.create_tensors``.
    y_test : numpy.ndarray
        Numpy Vector of size [``NUM_TOKENS``] with 0-indexed class labels for
        each input token.
    metric : str, optional
        Metric name forwarded to ``metrics.compute_score``. Defaults to
        ``"accuracy"``. For supported metrics see ``interpretation.metrics``
    return_predictions : bool, optional
        If set to True, the predicted labels are returned along with the
        score. Defaults to False.
    selected_neurons : list of int, optional
        Column indices of ``X_test`` to evaluate on. Defaults to all columns.

    Returns
    -------
    predictions : torch.Tensor, optional
        Only when ``return_predictions`` is True: the predicted class index
        for every input sample, returned *before* the score, i.e. the return
        value is ``(predictions, score)``.
    score
        Whatever ``metrics.compute_score(preds, labels, metric)`` returns.

    """
    # Identity check: ``== None`` is element-wise (and un-truthy) for arrays.
    if selected_neurons is None:
        selected_neurons = list(range(X_test.shape[1]))

    # Register the evaluation data on the probe as the "test" split.
    probe.test_features = torch.tensor(X_test).to(probe.device)
    probe.test_labels = torch.tensor(y_test).to(probe.device).long()
    probe.feature_sets["test"] = probe.test_features
    probe.label_sets["test"] = probe.test_labels
    probe.test_categorical = probe._get_categorical("test")
    probe.categorical_sets["test"] = probe.test_categorical

    # Score the test split using only the selected neurons.
    probe._get_distributions(selected_neurons)
    probe._compute_probs(selected_neurons, "test")
    preds, labels, _acc, _mi, _nmi = probe._predict("test")

    result = metrics.compute_score(preds, labels, metric)
    if return_predictions:
        return preds, result
    return result
def get_neuron_ordering(probe, num_of_neurons):
    """
    Get global ordering of neurons from a trained probe.

    Neurons are chosen greedily: at every step each not-yet-selected neuron
    is tentatively added to the current selection, the probe is scored on
    its training split, and the neuron giving the highest mutual information
    is kept. Every step evaluates the probe once per remaining neuron, so
    the cost grows quickly with ``num_of_neurons``.

    Parameters
    ----------
    probe : interpretation.gaussian_probe.GaussianProbe
        Trained probe model
    num_of_neurons : int
        The number of neurons you want; the method requires a not too large
        number (<400). Values larger than the number of available neurons
        are capped.

    Returns
    -------
    neuron_ordering : list
        List of at most ``num_of_neurons`` neuron indices in decreasing
        order of importance (selection order). It may be shorter than
        requested if no remaining neuron yields a comparable (non-NaN,
        greater than ``-inf``) mutual information.

    """
    total_neurons = probe.train_features.shape[1]
    remaining = list(range(total_neurons))
    selected_neurons = []

    # Never ask for more neurons than exist; otherwise an invalid index
    # would have to be emitted once every neuron has been picked.
    for _ in range(min(num_of_neurons, total_neurons)):
        best_neuron = None
        best_mi = float("-inf")
        for neuron in remaining:
            candidate = selected_neurons + [neuron]
            probe._get_distributions(candidate)
            probe._compute_probs(candidate, "train")
            _preds, _labels, _acc, mi, _nmi = probe._predict("train")
            if mi > best_mi:
                best_mi = mi
                best_neuron = neuron
        if best_neuron is None:
            # Every candidate scored NaN / -inf: stop rather than record a
            # bogus index.
            break
        selected_neurons.append(best_neuron)
        remaining.remove(best_neuron)
    return selected_neurons