import torch

x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
print(x)

y = x + 2 # a grad_fn used in backprop for calculating the gradient is created
# y.retain_grad() # for getting grad of y (a non-leaf tensor)
print(y)

z = y * y * 2
z = z.mean()
print(z)

z.backward() # no argument needed because z is a scalar -> computes the exact gradient dz/dx via the chain rule
# print(y.grad)
print(x.grad)

z = y * y * 2
print(z)

# z.backward() would fail here because z is not a scalar -> pass a vector for the vector-Jacobian product (VJP)
# backward(v) computes v^T * J via the chain rule, so x.grad receives the gradient of each output element weighted by v
# (this is exact, not a numerical approximation; the tiny values below just scale the result close to zero)
v = torch.tensor([0.000000001, 0.000000001, 0.000000001], dtype=torch.float32)
z.backward(v) # pass vector for the VJP
print(x.grad) # note: gradients accumulate, so this still contains the gradient from the first backward() as well

# prevent operations from being tracked in the computational graph (requires_grad)
# 3 options
# 1. x.requires_grad_(False) -> turn off requires_grad in place
# 2. x.detach() -> returns a new tensor that shares the data but has requires_grad=False
# 3. with torch.no_grad(): -> lets you do operations without grad tracking temporarily

x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = x * x
print(x)
# 1
x.requires_grad_(False)
print(x)

x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = x * x
print(x)
# 2
z = x.detach()
print(z)

x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = x * x
print(x)
# 3
with torch.no_grad():
    a = x + 2
    print(a) # no grad_fn, not tracked
b = x + 2
print(b) # has grad_fn, tracked

# gradients are summed up in .grad on every backward() call! -> they must be zeroed between iterations
# this is a dummy training loop
weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    model_output = (weights * 3).sum()
    model_output.backward()
    print(weights.grad)
    weights.grad.zero_() # clear gradients

# later, an optimizer does the update and the clearing for you
optimizer = torch.optim.SGD([weights], lr=0.01) # stochastic gradient descent; expects an iterable of parameters
optimizer.step()
optimizer.zero_grad() # clear gradients

# RECAP
# turn on gradient tracking for the tensors you want gradients for (f(x) = x², f'(x) = ? -> requires_grad=True for x)
# calculate gradients with f.backward(); pass a vector for non-scalar outputs (vector-Jacobian product), not needed for scalar outputs like mean()
# clear gradients with x.grad.zero_()
# prevent operations from being tracked in the comp graph with one of the 3 options above
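
# --- Supplementary sketch (not part of the original notes) ---
# A minimal end-to-end example tying the pieces above together: track a weight with
# requires_grad, call backward() on a scalar loss, update the weight inside
# torch.no_grad() so the update itself is not recorded, and zero the gradient before
# the next iteration. The toy data (target y = 2*x), the learning rate, and the number
# of epochs are made-up values for illustration only.
import torch

x_data = torch.tensor([1.0, 2.0, 3.0, 4.0])
y_data = torch.tensor([2.0, 4.0, 6.0, 8.0]) # assumed target relationship: y = 2*x

w = torch.tensor(0.0, requires_grad=True) # single weight to learn

for epoch in range(20):
    y_pred = w * x_data                    # forward pass builds the graph
    loss = ((y_pred - y_data) ** 2).mean() # scalar loss -> backward() needs no argument
    loss.backward()                        # populates w.grad

    with torch.no_grad():                  # the update step must not be tracked
        w -= 0.01 * w.grad

    w.grad.zero_()                         # gradients accumulate, so clear them

print(w) # approaches 2.0

# The same loop with torch.optim.SGD: step() replaces the manual update and
# zero_grad() replaces w.grad.zero_().
w = torch.tensor(0.0, requires_grad=True)
optimizer = torch.optim.SGD([w], lr=0.01)

for epoch in range(20):
    loss = ((w * x_data - y_data) ** 2).mean()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

print(w) # approaches 2.0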