diff --git a/python/mlx/nn/layers/activations.py b/python/mlx/nn/layers/activations.py index 7b7bb138e6..3c834679d1 100644 --- a/python/mlx/nn/layers/activations.py +++ b/python/mlx/nn/layers/activations.py @@ -66,7 +66,7 @@ def leaky_relu(x, negative_slope=0.01): def log_softmax(x, axis=-1): r"""Applies the Log Softmax function. - Applies :math:`x + \log \sum_i e^{x_i}` element wise. + Applies :math:`x - \log \sum_i e^{x_i}` element wise. """ return x - mx.logsumexp(x, axis=axis, keepdims=True) @@ -176,7 +176,7 @@ def gelu_approx(x): .. math:: - x = 0.5 * x * \left(1 + \text{Tanh}\left((\sqrt{2 / \pi} * \left(x + 0.044715 * x^3\right)\right)\right) + x = 0.5 * x * \left(1 + \text{Tanh}\left(\sqrt{2 / \pi} * \left(x + 0.044715 * x^3\right)\right)\right) """ return 0.5 * x * (1 + mx.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * x**3))) @@ -552,7 +552,7 @@ class GELU(Module): However, if ``approx`` is set to 'precise' or 'fast' it applies .. math:: - \textrm{GELUApprox}(x) &= 0.5 * x * \left(1 + \text{Tanh}\left((\sqrt{2 / \pi} * \left(x + 0.044715 * x^3\right)\right)\right) \\ + \textrm{GELUApprox}(x) &= 0.5 * x * \left(1 + \text{Tanh}\left(\sqrt{2 / \pi} * \left(x + 0.044715 * x^3\right)\right)\right) \\ \textrm{GELUFast}(x) &= x * \sigma\left(1.702 * x\right) respectively.