# multi-head self-attention layer
# mb = 1, num_heads = 12, hidden_size = 768, t_x = t_y = 128
12x128x64:12x64x128_n"encoder:QK_matmul:12"
12x128x128:12x128x64_n"encoder:WV_matmul:12"
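
# Descriptor format (assuming benchdnn matmul conventions): each line encodes
# one batched matmul as BxMxK:BxKxN with an _n"name" label. Here
# B = mb * num_heads, K = head size = hidden_size / num_heads = 64, and
# M = N = t_x = t_y = 128. The trailing number in each name (":12", ":24")
# likely gives how many encoder layers the matmul repeats over, e.g. 12 for
# a BERT-base-sized model and 24 for a BERT-large-sized one.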

# mb = 128, num_heads = 12, hidden_size = 768, t_x = t_y = 128
1536x128x64:1536x64x128_n"encoder:QK_matmul:12"
1536x128x128:1536x128x64_n"encoder:WV_matmul:12"
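
# A minimal NumPy shape check for the block above (a sketch, not part of the
# batch file; softmax and scaling omitted, variable names illustrative):
#
#   import numpy as np
#   mb, heads, seq, hidden = 128, 12, 128, 768
#   head_dim = hidden // heads                  # 64
#   q  = np.zeros((mb * heads, seq, head_dim))  # 1536x128x64
#   kt = np.zeros((mb * heads, head_dim, seq))  # 1536x64x128
#   s  = q @ kt                                 # QK_matmul  -> 1536x128x128
#   v  = np.zeros((mb * heads, seq, head_dim))  # 1536x128x64
#   o  = s @ v                                  # WV_matmul  -> 1536x128x64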

# mb = 128, num_heads = 16, hidden_size = 1024, t_x = t_y = 128
#2048x128x64:2048x64x128_n"encoder:QK_matmul:24"
#2048x128x128:2048x128x64_n"encoder:WV_matmul:24"
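# (batch = mb * num_heads = 128 * 16 = 2048; head size = 1024 / 16 = 64)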
