Reverse Mode Automatic Differentiation

In this blog post, we will discuss and implement a framework for reverse mode automatic differentiation. This is a useful technique for computing the gradient of a function with respect to its inputs. It is particularly useful in machine learning and deep learning, where we often need to compute the gradient of a loss function with respect to the model parameters.

Why Forward Mode is Not Practical

In the last blog post, I discussed and implemented a framework for forward mode automatic differentiation. It used a new number system called the Dual numbers, denoted as $\mathbb{R}(\epsilon)$, which is an extension of $\mathbb{R}$ with an $\epsilon \neq 0$ such that $\epsilon^2 = 0$. We saw that given a smooth, analytic function $f: \mathbb{R}(\epsilon) \to \mathbb{R}(\epsilon)$, evaluating it at the dual number $x + \epsilon$ gives both the value of the function at $x$ and its derivative at $x$: $$ f(x + \epsilon) = f(x) + f'(x) \epsilon $$ This is a useful property since a single evaluation yields both the value and the derivative. It turns out that we can extend this approach to multi-variable functions of the form $f: \mathbb{R}^n(\epsilon) \to \mathbb{R}(\epsilon)$. The problem is that in multiple variables, we have to compute the gradient rather than a single scalar derivative. For functions like $f$ above, the gradient is a $1 \times n$ row vector containing all the partial derivatives: $$ \nabla f(x) = \begin{bmatrix} \frac{\partial f}{\partial x_1} & \frac{\partial f}{\partial x_2} & \ldots & \frac{\partial f}{\partial x_n} \end{bmatrix} $$

Now, evaluating the function at a dual vector $\vec{x}+ \vec{e_i} \epsilon$ (where $\vec{e_i}$ is the ith standard basis vector) will only give us the value of the function at $\vec{x}$ and the ith coordinate partial derivative $\frac{\partial f}{\partial x_i}$ at $\vec{x}$: $$ f(\vec{x} + \vec{e_i} \epsilon) = f(\vec{x}) + \frac{\partial f}{\partial x_i} \epsilon $$ In order to fully populate the gradient, we need to evaluate the function at $n$ different dual vectors $\vec{x} + \vec{e_i} \epsilon$ for $i = 1, \ldots, n$. If evaluating the function takes $O(C)$ time, then the total time complexity of this approach is $O(nC)$. This is not a problem if $n$ is small, but in practice, $n$ is very large. For example, in deep learning, even simple neural networks can have millions of parameters. So, we need a more efficient way to compute the gradient.
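To make this cost concrete, here is a minimal sketch of forward mode using a stripped-down dual number class (an illustrative stand-in I am assuming here, not the full implementation from the previous post). Computing the gradient of $f(x_1, x_2) = x_1 x_2 + \sin(x_1)$ requires one seeded evaluation per input coordinate:

In [ ]:
import math

class Dual:
    """A minimal dual number a + b*eps, just for illustrating the cost of forward mode."""
    def __init__(self, real, eps=0.0):
        self.real, self.eps = real, eps

    def __add__(self, other):
        other = other if isinstance(other, Dual) else Dual(other)
        return Dual(self.real + other.real, self.eps + other.eps)

    def __mul__(self, other):
        other = other if isinstance(other, Dual) else Dual(other)
        return Dual(self.real * other.real,
                    self.real * other.eps + self.eps * other.real)

    def sin(self):
        return Dual(math.sin(self.real), math.cos(self.real) * self.eps)

def forward_mode_gradient(f, x):
    """One full evaluation of f per input coordinate: O(n * C)."""
    grad = []
    for i in range(len(x)):
        # Seed the i-th coordinate with eps = 1 to extract df/dx_i.
        args = [Dual(v, 1.0 if j == i else 0.0) for j, v in enumerate(x)]
        grad.append(f(*args).eps)
    return grad

# f(x1, x2) = x1*x2 + sin(x1); the gradient at (pi/2, 2) is [2, pi/2]
print(forward_mode_gradient(lambda a, b: a * b + a.sin(), [math.pi / 2, 2.0]))

Each coordinate needs its own seeded evaluation, which is exactly the $O(nC)$ cost described above; reverse mode avoids this by producing all $n$ partial derivatives from a single backward pass.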

Solution: Reverse Mode Automatic Differentiation

The key idea behind reverse mode automatic differentiation is that any smooth analytic function $f$ can be decomposed into a sequence of elementary operations. This means we can store the intermediate results of the function as we compute it, and then later use these intermediate results to compute the gradient. While forward mode autodiff computes derivatives by propagating the derivative from the input to the output, reverse mode works using the following two steps:

  • Forward pass: compute the function value while building a directed acyclic graph (DAG) of the computation.
  • Backward pass: traverse the graph in reverse order to compute the gradients, applying the chain rule.

Computation Graph

A computation graph is a directed acyclic graph (DAG) where each node represents an operation or a variable, and the edges represent the flow of data between nodes. The input variables are the leaves of the graph, and the output variable is its root. For example, consider the function $$ f(x_1, x_2) = x_1 x_2 + \sin(x_1) $$ Its computation graph can be represented as:

Computation graph of f(x1, x2)

If we let $x_1 = \frac{\pi}{2}$ and $x_2 = 2$, then the forward pass will compute the value of the function as follows:

  1. $v_1 = x_1 x_2 = \frac{\pi}{2} \cdot 2 = \pi$.
  2. $v_2 = \sin(x_1) = \sin(\frac{\pi}{2}) = 1$.
  3. $f(x_1, x_2) = v_1 + v_2 = \pi + 1$.

While computing the function value, we also store the intermediate results $v_1$ and $v_2$ in the graph. The backward pass will compute the gradients as follows:

  1. Compute the gradients of the output with respect to the operands of the final operation $f = v_1 + v_2$: $$ \frac{\partial f}{\partial v_1} = 1, \quad \frac{\partial f}{\partial v_2} = 1 $$
  2. Compute the gradients of the intermediate variables with respect to the input variables: $$ \frac{\partial v_1}{\partial x_1} = x_2 = 2, \quad \frac{\partial v_1}{\partial x_2} = x_1 = \frac{\pi}{2} $$ $$ \frac{\partial v_2}{\partial x_1} = \cos(x_1) = \cos\left(\frac{\pi}{2}\right) = 0, \quad \frac{\partial v_2}{\partial x_2} = 0 $$
  3. Finally, we can compute the gradients of the output with respect to the input variables: $$ \begin{align*} \frac{\partial f}{\partial x_1} &= \frac{\partial f}{\partial v_1} \cdot \frac{\partial v_1}{\partial x_1} + \frac{\partial f}{\partial v_2} \cdot \frac{\partial v_2}{\partial x_1} = 1 \cdot 2 + 1 \cdot 0 = 2 \\ \frac{\partial f}{\partial x_2} &= \frac{\partial f}{\partial v_1} \cdot \frac{\partial v_1}{\partial x_2} + \frac{\partial f}{\partial v_2} \cdot \frac{\partial v_2}{\partial x_2} = 1 \cdot \frac{\pi}{2} + 1 \cdot 0 = \frac{\pi}{2} \end{align*} $$ Therefore, the gradient of $f$ at point $(\frac{\pi}{2}, 2)$ is: $$ \nabla f\left(\frac{\pi}{2}, 2\right) = \begin{bmatrix} 2 & \frac{\pi}{2} \end{bmatrix} $$
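As a quick sanity check (plain Python with central finite differences, independent of any autodiff machinery), we can verify the hand-computed gradient numerically:

In [ ]:
import math

def f(x1, x2):
    return x1 * x2 + math.sin(x1)

def central_difference_grad(f, x1, x2, h=1e-6):
    # Approximate each partial derivative with a symmetric difference quotient.
    df_dx1 = (f(x1 + h, x2) - f(x1 - h, x2)) / (2 * h)
    df_dx2 = (f(x1, x2 + h) - f(x1, x2 - h)) / (2 * h)
    return df_dx1, df_dx2

print(central_difference_grad(f, math.pi / 2, 2.0))  # approximately (2, pi/2)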

In the calculation above, we used the chain rule to compute the gradients. As a refresher, the multivariable chain rule states that if $f$ depends on intermediate variables $v_1, \ldots, v_m$, each of which depends on $x$, then the derivative of $f$ with respect to $x$ is given by: $$ \frac{\partial f}{\partial x} = \sum_{i=1}^{m} \frac{\partial f}{\partial v_i} \cdot \frac{\partial v_i}{\partial x} $$ This is the essence of reverse mode automatic differentiation. We compute the function value and the intermediate results in the forward pass, and then we use the chain rule to compute the gradients in the backward pass. More formally, the algorithm for the backward pass can be summarized as follows:

  1. Initialize the gradient of the output with respect to itself to 1.
  2. Traverse the graph in reverse topological order, and for each node
    1. Take the node's accumulated gradient, i.e. the gradient of the output with respect to that node.
    2. Multiply it by the local partial derivative of the node with respect to each of its inputs (the chain rule).
    3. Add the result to the accumulated gradient of each input.
  3. Return the gradients of the output with respect to the inputs. A short sketch of this procedure is given below.
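To make the procedure concrete, here is a minimal, self-contained sketch of the backward pass. The SketchNode class, its inputs/local_grads/grad attributes, and the helper names are assumptions for illustration, not the interface of the implementation that follows in the next section:

In [ ]:
import math

# Each sketch node stores its operand nodes, the local partial derivative of its value
# with respect to each operand, and its accumulated gradient.
class SketchNode:
    def __init__(self, inputs=(), local_grads=()):
        self.inputs = inputs              # operand nodes
        self.local_grads = local_grads    # d(node)/d(input) for each operand
        self.grad = 0.0                   # accumulated d(output)/d(node)

def topological_order(output):
    order, visited = [], set()
    def visit(node):
        if node not in visited:
            visited.add(node)
            for inp in node.inputs:
                visit(inp)
            order.append(node)
    visit(output)
    return order

def backward_pass(output):
    output.grad = 1.0                                 # step 1: d(output)/d(output) = 1
    for node in reversed(topological_order(output)):  # step 2: reverse topological order
        for inp, local in zip(node.inputs, node.local_grads):
            inp.grad += node.grad * local             # chain rule + accumulation
    # step 3: the gradients now live in the .grad attributes of the input nodes

# Reproduce the worked example at (x1, x2) = (pi/2, 2):
x1, x2 = SketchNode(), SketchNode()
v1 = SketchNode(inputs=(x1, x2), local_grads=(2.0, math.pi / 2))     # v1 = x1*x2
v2 = SketchNode(inputs=(x1,), local_grads=(math.cos(math.pi / 2),))  # v2 = sin(x1)
f = SketchNode(inputs=(v1, v2), local_grads=(1.0, 1.0))              # f  = v1 + v2
backward_pass(f)
print(x1.grad, x2.grad)   # 2.0 and pi/2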

Note that the forward pass costs $O(C)$, where $C$ is the cost of evaluating the function. The backward pass traverses the graph in reverse topological order and applies the chain rule at each node; since the graph has one node per elementary operation, this also takes $O(C)$ time. Because there is only one output, a single backward pass yields the gradients with respect to all $n$ inputs. Therefore, the total time complexity of reverse mode automatic differentiation is $O(C)$, which is much better than the $O(nC)$ time complexity of forward mode automatic differentiation!

Implementation in Python

In [ ]:
import math

class Node:
    """
    A Node in the computation graph that supports reverse mode automatic differentiation.
    """

    def __init__(self, data: float, _children: tuple = (), _op: str = '', label: str = ''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None  # Function to compute local gradients, implements chain rule
        self._prev = set(_children) # Operand nodes that produced this node (its predecessors in the graph)
        self._op = _op # Operation that produced this node
        self.label = label # Label for the node, debugging purposes

    def __repr__(self):
        return f"Node(data={self.data}, grad={self.grad})"

    def __add__(self, other):
        other = other if isinstance(other, Node) else Node(other)
        out = Node(self.data + other.data, (self, other), '+') # Evaluate the operation and create a new node

        # Save this computation's local gradients so we can apply the chain rule during the
        # backward pass. For addition, the local derivatives are ∂out/∂self = 1 and
        # ∂out/∂other = 1, so the chain rule gives each operand a contribution of
        # ∂f/∂out * 1 = out.grad. We use += because a node may feed into several operations.
        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward

        return out

    def __mul__(self, other):
        other = other if isinstance(other, Node) else Node(other)
        out = Node(self.data * other.data, (self, other), '*') # Evaluate the operation and create a new node

        # Again, save this computation's local gradients for the backward pass.
        # For multiplication, the local derivative with respect to each operand
        # is the other operand: ∂out/∂self = other.data and ∂out/∂other = self.data
        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward

        return out

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Node(self.data**other, (self,), f'**{other}') # Evaluate the operation and create a new node

        # For exponentiation, the local gradient is given by the power rule:
        # ∂out/∂self = other * self.data**(other - 1)
        # Note: if the exponent were itself a Node, we would also need the derivative with
        # respect to the exponent (self.data**other * log(self.data)); since we only support
        # constant int/float powers, that term is not needed.
        def _backward():
            self.grad += other * (self.data ** (other - 1)) * out.grad
        out._backward = _backward

        return out

    def __radd__(self, other):
        # Needed so that float + Node works (e.g. inside __rsub__ below)
        return self + other

    def __rmul__(self, other):
        return self * other

    def __truediv__(self, other):
        return self * other**-1

    def __neg__(self):
        return self * -1

    def __sub__(self, other):
        return self + (-other)

    def __rsub__(self, other):
        return other + (-self)

    def exp(self):
        out = Node(math.exp(self.data), (self,), 'exp')

        def _backward():
            self.grad += out.data * out.grad
        out._backward = _backward

        return out

    def log(self):
        out = Node(math.log(self.data), (self,), 'log')

        def _backward():
            self.grad += (1.0 / self.data) * out.grad
        out._backward = _backward

        return out

    def sin(self):
        out = Node(math.sin(self.data), (self,), 'sin')

        def _backward():
            self.grad += math.cos(self.data) * out.grad
        out._backward = _backward

        return out

    def cos(self):
        out = Node(math.cos(self.data), (self,), 'cos')

        def _backward():
            self.grad += -math.sin(self.data) * out.grad
        out._backward = _backward

        return out

    def tan(self):
        out = Node(math.tan(self.data), (self,), 'tan')

        def _backward():
            self.grad += (1.0 / math.cos(self.data)**2) * out.grad
        out._backward = _backward

        return out

    def tanh(self):
        t = math.tanh(self.data)
        out = Node(t, (self,), 'tanh')

        def _backward():
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward

        return out

    def relu(self):
        out = Node(0 if self.data < 0 else self.data, (self,), 'ReLU')

        def _backward():
            self.grad += (out.data > 0) * out.grad
        out._backward = _backward

        return out

    def backward(self):
        """
        Perform reverse mode automatic differentiation to compute gradients
        using the topological order of the computation graph and stored
        backward functions for each node.
        """

        topo = []
        visited = set()

        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)

        build_topo(self)

        self.grad = 1.0

        for node in reversed(topo):
            node._backward()
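With the Node class in place, we can reproduce the worked example from earlier and confirm that the gradient of $f(x_1, x_2) = x_1 x_2 + \sin(x_1)$ at $\left(\frac{\pi}{2}, 2\right)$ is $\begin{bmatrix} 2 & \frac{\pi}{2} \end{bmatrix}$:

In [ ]:
x1 = Node(math.pi / 2, label='x1')
x2 = Node(2.0, label='x2')
f = x1 * x2 + x1.sin()   # f(x1, x2) = x1*x2 + sin(x1)
f.backward()

print(f.data)    # pi + 1 ≈ 4.1416
print(x1.grad)   # ∂f/∂x1 = x2 + cos(x1) ≈ 2.0
print(x2.grad)   # ∂f/∂x2 = x1 ≈ 1.5708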
In [47]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

def f1(x_node, y_node):
    return x_node * y_node + x_node.sin()

def f2(x_node, y_node):
    return x_node**2 + y_node**2

x_vals = np.linspace(-2, 2, 20)
y_vals = np.linspace(-2, 2, 20)
X, Y = np.meshgrid(x_vals, y_vals)

Z1 = np.zeros_like(X)
grad_x1 = np.zeros_like(X)
grad_y1 = np.zeros_like(Y)

Z2 = np.zeros_like(X)
grad_x2 = np.zeros_like(X)
grad_y2 = np.zeros_like(Y)

for i in range(len(x_vals)):
    for j in range(len(y_vals)):
        x = Node(X[i, j], label=f'x1_{i}_{j}')
        y = Node(Y[i, j], label=f'y1_{i}_{j}')
        z = f1(x, y)
        z.backward()
        Z1[i, j] = z.data
        grad_x1[i, j] = x.grad
        grad_y1[i, j] = y.grad

        x2 = Node(X[i, j], label=f'x2_{i}_{j}')
        y2 = Node(Y[i, j], label=f'y2_{i}_{j}')
        z2 = f2(x2, y2)
        z2.backward()
        Z2[i, j] = z2.data
        grad_x2[i, j] = x2.grad
        grad_y2[i, j] = y2.grad

fig = plt.figure(figsize=(16, 12))

ax1 = fig.add_subplot(221, projection='3d')
ax1.plot_surface(X, Y, Z1, cmap='viridis')
ax1.set_title(r'Plot of $f_1(x,y) = xy + \sin(x)$')
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.set_zlabel('Z')

ax2 = fig.add_subplot(222, projection='3d')
ax2.quiver(X, Y, Z1, grad_x1, grad_y1, np.zeros_like(Z1), length=0.1, normalize=True)
ax2.set_title(r'Autograd Gradients of $f_1(x,y)$')
ax2.set_xlabel('X')
ax2.set_ylabel('Y')
ax2.set_zlabel('Gradient')

ax3 = fig.add_subplot(223, projection='3d')
ax3.plot_surface(X, Y, Z2, cmap='viridis')
ax3.set_title(r'Plot of $f_2(x,y) = x^2 + y^2$')
ax3.set_xlabel('X')
ax3.set_ylabel('Y')
ax3.set_zlabel('Z')

ax4 = fig.add_subplot(224, projection='3d')
ax4.quiver(X, Y, Z2, grad_x2, grad_y2, np.zeros_like(Z2), length=0.1, normalize=True)
ax4.set_title(r'Autograd Gradients of $f_2(x,y)$')
ax4.set_xlabel('X')
ax4.set_ylabel('Y')
ax4.set_zlabel('Gradient')

plt.tight_layout()
plt.show()
Surface plots of $f_1(x,y) = xy + \sin(x)$ and $f_2(x,y) = x^2 + y^2$ (left) alongside the gradient fields computed by our autodiff framework (right).