Add bf16 Support to Adam #134

Closed
wants to merge 21 commits into from
Changes from 6 commits
6 changes: 3 additions & 3 deletions csrc/cuda/adam_cuda.cu
@@ -54,8 +54,8 @@ __global__ void adam_fp32_accum_bf16(
     int32_t col = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (col < n) {
-        float local_g = __nv_bfloat162float(g[col]) / scale;   // real_g
-        float local_m = beta1 * __nv_bfloat162float(m[col]) + (1 - beta1) * local_g;   // real_m
+        float local_g = __bfloat162float(g[col]) / scale;   // real_g
+        float local_m = beta1 * __bfloat162float(m[col]) + (1 - beta1) * local_g;   // real_m
         float local_v = beta2 * v[col] + (1 - beta2) * local_g * local_g;   // real_v
         float local_p = param[col];
         local_p = local_p - lr * local_m / bias_correction1 / (sqrtf(local_v / bias_correction2 / scale) + eps) - lr * weight_decay * local_p;
@@ -122,4 +122,4 @@ void adam_bf16_launcher(
     dim3 block_size = dim3(threads, 1, 1);
     dim3 grid_size = dim3((n + threads - 1) / threads, 1, 1);
     adam_fp32_accum_bf16<<<grid_size, block_size, 0, reinterpret_cast<cudaStream_t>(stream)>>>(n, g_ptr, m_ptr, v_fp32_ptr, param_fp32_ptr, param_h_ptr, beta1, beta2, eps, lr, scale, weight_decay, bias_correction1, bias_correction2);
-}
\ No newline at end of file
+}
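
Read as scalar math, the kernel unscales the bf16 gradient, accumulates the Adam moments and master weights in fp32, and applies decoupled weight decay. A line-for-line PyTorch sketch of that per-element update (a readability aid, not the repository's API; the write-backs after the shown lines are assumed from the kernel's pointer arguments):

    import torch

    def adam_fp32_accum_bf16_ref(g, m, v, param, param_h,
                                 beta1, beta2, eps, lr, scale, weight_decay,
                                 bias_correction1, bias_correction2):
        # g, m, param_h are bf16 tensors; v and param are fp32 master state.
        local_g = g.float() / scale                             # real_g: undo the loss scale
        local_m = beta1 * m.float() + (1 - beta1) * local_g     # real_m, accumulated in fp32
        local_v = beta2 * v + (1 - beta2) * local_g * local_g   # real_v, kept in fp32
        # Matches the kernel's sqrtf(local_v / bias_correction2 / scale) term,
        # with decoupled weight decay applied directly to the master weights.
        local_p = param - lr * local_m / bias_correction1 / (
            torch.sqrt(local_v / bias_correction2 / scale) + eps
        ) - lr * weight_decay * param
        param_h.copy_(local_p.to(torch.bfloat16))  # bf16 copy for the model (assumed write-back)
        param.copy_(local_p)                       # fp32 master weights (assumed write-back)
        v.copy_(local_v)
        m.copy_(local_m.to(torch.bfloat16))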
3 changes: 2 additions & 1 deletion other_requirements.txt
@@ -3,4 +3,5 @@ cpm_kernels>=1.0.11
 jieba
 tensorboard
 setuptools_rust
-transformers
\ No newline at end of file
+transformers
+pybind11
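
pybind11 presumably enters the requirements because the CUDA sources are compiled into an importable Python extension. A hypothetical minimal setup.py sketch of that kind of build (module name illustrative, not taken from the repository):

    # Hypothetical build sketch: compile the Adam CUDA kernels into a Python
    # extension; pybind11 provides the binding layer used by such extensions.
    from setuptools import setup
    from torch.utils.cpp_extension import BuildExtension, CUDAExtension

    setup(
        name="adam_cuda_ext",  # illustrative name, not from the repo
        ext_modules=[
            CUDAExtension(
                name="adam_cuda_ext",
                sources=["csrc/cuda/adam_cuda.cu"],  # plus a binding .cpp in practice
            )
        ],
        cmdclass={"build_ext": BuildExtension},
    )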
10 changes: 5 additions & 5 deletions tests/test_optim_bf16.py
@@ -34,7 +34,7 @@ def main():
 
     model1 = model1.cuda().to(dtype=torch.bfloat16)
     model2 = model2.cuda().to(dtype=torch.bfloat16)
-    model3 = model3.cuda().to(dtype=torch.bfloat16)
+    model3 = model3.cuda()
 
     opt1 = bmt.optim.AdamOptimizer(model1.parameters(), weight_decay=1e-3)
     opt2 = bmt.optim.AdamOffloadOptimizer(model2.parameters(), weight_decay=1e-3)
@@ -46,10 +46,10 @@ def main():
     opt3.zero_grad()
 
     for p1, p2, p3 in zip(model1.parameters(), model2.parameters(), model3.parameters()):
-        grad_bf16 = torch.randn_like(p1).to(dtype=torch.bfloat16)
-        p1.grad = grad_bf16
-        p2.grad = grad_bf16
-        p3.grad = grad_bf16
+        grad = torch.randn_like(p1)
+        p1.grad = grad.to(dtype=torch.bfloat16)
+        p2.grad = grad.to(dtype=torch.bfloat16)
+        p3.grad = grad.float()
 
     opt1.step()
     opt2.step()
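
The revised test draws each gradient once and only casts it per optimizer, with model3 kept in fp32, so any divergence after step() comes from the optimizer path rather than from sampling. A standalone sketch of the same comparison pattern using plain torch.optim.Adam (illustrative names, no bmt initialization; not the repository's test):

    import torch

    torch.manual_seed(0)

    # One bf16 parameter and an fp32 reference that starts from identical values.
    p_bf16 = torch.nn.Parameter(torch.randn(4, 4, dtype=torch.bfloat16))
    p_fp32 = torch.nn.Parameter(p_bf16.detach().float())

    # Sample the gradient once, then hand each parameter a cast of the same values.
    grad = torch.randn_like(p_fp32)
    p_bf16.grad = grad.to(dtype=torch.bfloat16)
    p_fp32.grad = grad

    opt_bf16 = torch.optim.Adam([p_bf16], lr=1e-3, weight_decay=1e-3)
    opt_fp32 = torch.optim.Adam([p_fp32], lr=1e-3, weight_decay=1e-3)
    opt_bf16.step()
    opt_fp32.step()

    # bf16 keeps only ~8 mantissa bits, so a loose tolerance is appropriate.
    print(torch.allclose(p_bf16.float(), p_fp32, atol=1e-2, rtol=1e-2))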