import numpy as np


class Tensor(object):

    def __init__(self, data, autograd=False, creators=None,
                 creation_op=None, id=None):
        self.data = np.array(data)
        self.autograd = autograd
        self.grad = None
        if (id is None):
            self.id = np.random.randint(0, 100000)
        else:
            self.id = id

        self.creators = creators
        self.creation_op = creation_op
        self.children = {}

        # register this tensor as a child of each creator, so backward()
        # can wait until every child has contributed its gradient
        if (creators is not None):
            for c in creators:
                if (self.id not in c.children):
                    c.children[self.id] = 1
                else:
                    c.children[self.id] += 1

    def all_children_grads_accounted_for(self):
        for id, cnt in self.children.items():
            if (cnt != 0):
                return False
        return True

    def backward(self, grad=None, grad_origin=None):
        if (self.autograd):

            if (grad is None):
                grad = Tensor(np.ones_like(self.data))

            if (grad_origin is not None):
                if (self.children[grad_origin.id] == 0):
                    raise Exception("cannot backprop more than once")
                else:
                    self.children[grad_origin.id] -= 1

            if (self.grad is None):
                self.grad = grad
            else:
                self.grad += grad

            # grads must not have grads of their own
            assert grad.autograd == False

            # only continue backpropping if there's something to backprop
            # into and if all gradients (from children) are accounted for;
            # waiting for children is overridden when backward() was called
            # on this variable directly (grad_origin is None)
            if (self.creators is not None and
                    (self.all_children_grads_accounted_for() or
                     grad_origin is None)):

                if (self.creation_op == "add"):
                    self.creators[0].backward(self.grad, self)
                    self.creators[1].backward(self.grad, self)

                if (self.creation_op == "sub"):
                    self.creators[0].backward(Tensor(self.grad.data), self)
                    self.creators[1].backward(Tensor(self.grad.__neg__().data), self)

                if (self.creation_op == "mul"):
                    new = self.grad * self.creators[1]
                    self.creators[0].backward(new, self)
                    new = self.grad * self.creators[0]
                    self.creators[1].backward(new, self)

                if (self.creation_op == "mm"):
                    c0 = self.creators[0]
                    c1 = self.creators[1]
                    new = self.grad.mm(c1.transpose())
                    c0.backward(new)
                    new = self.grad.transpose().mm(c0).transpose()
                    c1.backward(new)

                if (self.creation_op == "transpose"):
                    self.creators[0].backward(self.grad.transpose())

                if ("sum" in self.creation_op):
                    dim = int(self.creation_op.split("_")[1])
                    self.creators[0].backward(self.grad.expand(dim,
                                              self.creators[0].data.shape[dim]))

                if ("expand" in self.creation_op):
                    dim = int(self.creation_op.split("_")[1])
                    self.creators[0].backward(self.grad.sum(dim))

                if (self.creation_op == "neg"):
                    self.creators[0].backward(self.grad.__neg__())

                if (self.creation_op == "sigmoid"):
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.creators[0].backward(self.grad * (self * (ones - self)))

                if (self.creation_op == "tanh"):
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.creators[0].backward(self.grad * (ones - (self * self)))

                if (self.creation_op == "index_select"):
                    # scatter the incoming gradient back onto the rows of the
                    # weight matrix that were selected in the forward pass
                    new_grad = np.zeros_like(self.creators[0].data)
                    indices_ = self.index_select_indices.data.flatten()
                    grad_ = grad.data.reshape(len(indices_), -1)
                    for i in range(len(indices_)):
                        new_grad[indices_[i]] += grad_[i]
                    self.creators[0].backward(Tensor(new_grad))

                if (self.creation_op == "cross_entropy"):
                    dx = self.softmax_output - self.target_dist
                    self.creators[0].backward(Tensor(dx))

    def __add__(self, other):
        if (self.autograd and other.autograd):
            return Tensor(self.data + other.data,
                          autograd=True,
                          creators=[self, other],
                          creation_op="add")
        return Tensor(self.data + other.data)

    def __neg__(self):
        if (self.autograd):
            return Tensor(self.data * -1,
                          autograd=True,
                          creators=[self],
                          creation_op="neg")
        return Tensor(self.data * -1)

    def __sub__(self, other):
        if (self.autograd and other.autograd):
            return Tensor(self.data - other.data,
                          autograd=True,
                          creators=[self, other],
                          creation_op="sub")
        return Tensor(self.data - other.data)

    def __mul__(self, other):
        if (self.autograd and other.autograd):
            return Tensor(self.data * other.data,
                          autograd=True,
                          creators=[self, other],
                          creation_op="mul")
        return Tensor(self.data * other.data)

    def sum(self, dim):
        if (self.autograd):
            return Tensor(self.data.sum(dim),
                          autograd=True,
                          creators=[self],
                          creation_op="sum_" + str(dim))
        return Tensor(self.data.sum(dim))

    def expand(self, dim, copies):
        # repeat the data `copies` times along a new axis inserted at `dim`
        trans_cmd = list(range(0, len(self.data.shape)))
        trans_cmd.insert(dim, len(self.data.shape))
        new_data = self.data.repeat(copies).reshape(
            list(self.data.shape) + [copies]).transpose(trans_cmd)

        if (self.autograd):
            return Tensor(new_data,
                          autograd=True,
                          creators=[self],
                          creation_op="expand_" + str(dim))
        return Tensor(new_data)

    def transpose(self):
        if (self.autograd):
            return Tensor(self.data.transpose(),
                          autograd=True,
                          creators=[self],
                          creation_op="transpose")
        return Tensor(self.data.transpose())

    def mm(self, x):
        if (self.autograd):
            return Tensor(self.data.dot(x.data),
                          autograd=True,
                          creators=[self, x],
                          creation_op="mm")
        return Tensor(self.data.dot(x.data))

    def sigmoid(self):
        if (self.autograd):
            return Tensor(1 / (1 + np.exp(-self.data)),
                          autograd=True,
                          creators=[self],
                          creation_op="sigmoid")
        return Tensor(1 / (1 + np.exp(-self.data)))

    def tanh(self):
        if (self.autograd):
            return Tensor(np.tanh(self.data),
                          autograd=True,
                          creators=[self],
                          creation_op="tanh")
        return Tensor(np.tanh(self.data))

    def index_select(self, indices):
        if (self.autograd):
            new = Tensor(self.data[indices.data],
                         autograd=True,
                         creators=[self],
                         creation_op="index_select")
            new.index_select_indices = indices
            return new
        return Tensor(self.data[indices.data])

    def cross_entropy(self, target_indices):
        # softmax over the last dimension, then the mean negative
        # log-likelihood of the target indices
        temp = np.exp(self.data)
        softmax_output = temp / np.sum(temp,
                                       axis=len(self.data.shape) - 1,
                                       keepdims=True)

        t = target_indices.data.flatten()
        p = softmax_output.reshape(len(t), -1)
        target_dist = np.eye(p.shape[1])[t]
        loss = -(np.log(p) * (target_dist)).sum(1).mean()

        if (self.autograd):
            out = Tensor(loss,
                         autograd=True,
                         creators=[self],
                         creation_op="cross_entropy")
            out.softmax_output = softmax_output
            out.target_dist = target_dist
            return out
        return Tensor(loss)

    def __repr__(self):
        return str(self.data.__repr__())

    def __str__(self):
        return str(self.data.__str__())
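# --- Illustrative check (not part of the original listing) ---
# A minimal sketch of using the autograd Tensor on its own: build a tiny
# expression graph and call backward() on the result. Gradients flow back
# through the "mul" and "add" branches defined above.
a = Tensor(np.array([1.0, 2.0, 3.0]), autograd=True)
b = Tensor(np.array([4.0, 5.0, 6.0]), autograd=True)
c = a * b + a
c.backward(Tensor(np.ones_like(c.data)))
print(a.grad)  # dc/da = b + 1 -> [5. 6. 7.]
print(b.grad)  # dc/db = a     -> [1. 2. 3.]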
class Layer(object):

    def __init__(self):
        self.parameters = list()

    def get_parameters(self):
        return self.parameters


class Tanh(Layer):

    def __init__(self):
        super().__init__()

    def forward(self, input):
        return input.tanh()


class Sigmoid(Layer):

    def __init__(self):
        super().__init__()

    def forward(self, input):
        return input.sigmoid()


class CrossEntropyLoss(object):

    def __init__(self):
        super().__init__()

    def forward(self, input, target):
        return input.cross_entropy(target)


class Sequential(Layer):

    def __init__(self, layers=None):
        super().__init__()
        # avoid a shared mutable default: each Sequential gets its own list
        self.layers = layers if (layers is not None) else list()

    def add(self, layer):
        self.layers.append(layer)

    def forward(self, input):
        for layer in self.layers:
            input = layer.forward(input)
        return input

    def get_parameters(self):
        params = list()
        for l in self.layers:
            params += l.get_parameters()
        return params


class Embedding(Layer):

    def __init__(self, vocab_size, dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.dim = dim

        # this random initialization style is just a convention from word2vec
        self.weight = Tensor((np.random.rand(vocab_size, dim) - 0.5) / dim,
                             autograd=True)
        self.parameters.append(self.weight)

    def forward(self, input):
        return self.weight.index_select(input)
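# --- Illustrative check (not part of the original listing) ---
# A quick sketch of Embedding.forward(): index_select pulls one row of the
# weight matrix per index, so three indices yield a (3, dim) activation.
# The sizes below are arbitrary assumptions chosen only for the demo.
example_embed = Embedding(5, 3)
example_idx = Tensor(np.array([0, 2, 4]))
example_rows = example_embed.forward(example_idx)
print(example_rows.data.shape)  # (3, 3): one 3-dim row per index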
class Linear(Layer):

    def __init__(self, n_inputs, n_outputs):
        super().__init__()
        # scale initial weights by sqrt(2 / n_inputs) so the layer's outputs
        # start out with a reasonable variance
        W = np.random.randn(n_inputs, n_outputs) * np.sqrt(2.0 / (n_inputs))
        self.weight = Tensor(W, autograd=True)
        self.bias = Tensor(np.zeros(n_outputs), autograd=True)

        self.parameters.append(self.weight)
        self.parameters.append(self.bias)

    def forward(self, input):
        return input.mm(self.weight) + self.bias.expand(0, len(input.data))


class MSELoss(Layer):

    def __init__(self):
        super().__init__()

    def forward(self, pred, target):
        return ((pred - target) * (pred - target)).sum(0)


class SGD(object):

    def __init__(self, parameters, alpha=0.1):
        self.parameters = parameters
        self.alpha = alpha

    def zero(self):
        for p in self.parameters:
            p.grad.data *= 0

    def step(self, zero=True):
        # vanilla gradient descent: move each parameter against its gradient
        for p in self.parameters:
            p.data -= p.grad.data * self.alpha

            if (zero):
                p.grad.data *= 0


np.random.seed(0)

# data indices
data = Tensor(np.array([1, 2, 1, 2]), autograd=True)

# target indices
target = Tensor(np.array([0, 1, 0, 1]), autograd=True)

model = Sequential([Embedding(3, 3), Tanh(), Linear(3, 4)])
criterion = CrossEntropyLoss()

optim = SGD(parameters=model.get_parameters(), alpha=0.1)

for i in range(10):

    # Predict
    pred = model.forward(data)

    # Compare
    loss = criterion.forward(pred, target)

    # Learn
    loss.backward(Tensor(np.ones_like(loss.data)))
    optim.step()
    print(loss)

'''
1.3885032434928422
0.9558181509266037
0.6823083585795604
0.5095259967493119
0.39574491472895856
0.31752527285348264
0.2617222861964216
0.22061283923954234
0.18946427334830068
0.16527389263866668
'''
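# --- Illustrative variation (not part of the original listing) ---
# A minimal sketch of the same framework trained with MSELoss instead of
# cross entropy. The network shape, data, and learning rate below are
# assumptions chosen only to show how the pieces fit together.
np.random.seed(0)
mse_data = Tensor(np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), autograd=True)
mse_target = Tensor(np.array([[0], [1], [0], [1]]), autograd=True)

mse_model = Sequential([Linear(2, 3), Tanh(), Linear(3, 1), Sigmoid()])
mse_criterion = MSELoss()
mse_optim = SGD(parameters=mse_model.get_parameters(), alpha=1)

for i in range(10):
    mse_pred = mse_model.forward(mse_data)
    mse_loss = mse_criterion.forward(mse_pred, mse_target)
    mse_loss.backward(Tensor(np.ones_like(mse_loss.data)))
    mse_optim.step()
    print(mse_loss)  # the printed squared error should trend downward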