Fix memory utils organization (#2990)
Summary:
Pull Request resolved: #2990

X-link: facebookresearch/FBGEMM#83

- Fix the memory utils source organization so that the new_unified_tensor op is exposed in CPU-only OSS builds

Reviewed By: spcyppt

Differential Revision: D61291513

fbshipit-source-id: 1b4a3582ff6240896d4cb529056c5ea7e2391a7b
q10 authored and facebook-github-bot committed Aug 15, 2024
1 parent 7105d5f commit 6d3c2fe
Showing 10 changed files with 98 additions and 69 deletions.
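In practice this makes torch.ops.fbgemm.new_unified_tensor callable from a CPU-only OSS install. A minimal usage sketch, mirroring the call made by the new test in fbgemm_gpu/test/uvm/ops_load_test.py below:

    import torch
    import fbgemm_gpu  # noqa: F401  # loads the library and registers the fbgemm ops

    # The first argument is a prototype tensor supplying dtype/device options,
    # the second is the output shape, and the third selects host-mapped
    # allocation instead of the default path.
    t = torch.ops.fbgemm.new_unified_tensor(
        torch.zeros(1, device="cpu", dtype=torch.float),
        [1000],
        False,  # is_host_mapped
    )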
5 changes: 2 additions & 3 deletions fbgemm_gpu/FbgemmGpu.cmake
@@ -451,6 +451,8 @@ set(fbgemm_gpu_sources_static_cpu
     codegen/training/backward/embedding_backward_dense_host_cpu.cpp
     codegen/utils/embedding_bounds_check_host_cpu.cpp
     src/config/feature_gates.cpp
+    src/memory_utils/memory_utils.cpp
+    src/memory_utils/memory_utils_ops.cpp
     src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp
     src/permute_multi_embedding_ops/permute_multi_embedding_function.cpp
     src/permute_multi_embedding_ops/permute_multi_embedding_ops_cpu.cpp
@@ -481,9 +483,6 @@ if(NOT FBGEMM_CPU_ONLY)
     codegen/utils/embedding_bounds_check_host.cpp
     src/intraining_embedding_pruning_ops/intraining_embedding_pruning_gpu.cpp
     src/layout_transform_ops/layout_transform_ops_gpu.cpp
-    src/memory_utils/memory_utils.cpp
-    src/memory_utils/memory_utils_ops.cpp
-    src/memory_utils/memory_utils_ops_cpu.cpp
     src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp
     src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp
     src/quantize_ops/quantize_ops_gpu.cpp
@@ -13,6 +13,7 @@
 from itertools import accumulate
 from typing import List, Optional, Tuple, Union

+import fbgemm_gpu  # noqa: F401
 import torch  # usort:skip
 from torch import nn, Tensor  # usort:skip

7 changes: 7 additions & 0 deletions fbgemm_gpu/src/memory_utils/memory_utils.cpp
@@ -7,11 +7,18 @@
  */

 #include "common.h"
+#include "fbgemm_gpu/cumem_utils.h"

 using Tensor = at::Tensor;

 namespace fbgemm_gpu {

+Tensor new_managed_tensor_meta(
+    const Tensor& self,
+    const std::vector<std::int64_t>& sizes) {
+  return at::empty(sizes, self.options());
+}
+
 Tensor new_unified_tensor_cpu(
     const Tensor& self,
     const std::vector<std::int64_t>& sizes,
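new_managed_tensor_meta moves into this always-compiled file because it has no CUDA dependency: it only creates an empty tensor from the prototype's options. Restated in Python for illustration (a sketch of the kernel's behavior, not part of the fbgemm_gpu API):

    import torch

    def new_managed_tensor_meta(self: torch.Tensor, sizes: list[int]) -> torch.Tensor:
        # Mirrors at::empty(sizes, self.options()): shape, dtype, and device come
        # from the prototype tensor; nothing UVM-specific happens here.
        return torch.empty(sizes, dtype=self.dtype, device=self.device)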
6 changes: 0 additions & 6 deletions fbgemm_gpu/src/memory_utils/memory_utils.cu
@@ -173,12 +173,6 @@ Tensor new_managed_tensor(
   return t;
 }

-Tensor new_managed_tensor_meta(
-    const Tensor& self,
-    const std::vector<std::int64_t>& sizes) {
-  return at::empty(sizes, self.options());
-}
-
 // Allocate a CUDA Tensor with unified managed memory (UVM) without the
 // additional steps taken by new_managed_tensor above
 Tensor new_vanilla_managed_tensor(
24 changes: 4 additions & 20 deletions fbgemm_gpu/src/memory_utils/memory_utils_ops.cpp
@@ -7,39 +7,23 @@
  */

 #include <torch/library.h>
-#include "common.cuh"
-#include "fbgemm_gpu/cumem_utils.h"
+#include "common.h"
 #include "fbgemm_gpu/sparse_ops_utils.h"

 using Tensor = at::Tensor;

 namespace fbgemm_gpu {

 TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
-  m.def("is_uvm_tensor(Tensor t) -> bool", TORCH_FN(is_uvm_tensor));
-  m.def("uvm_storage(Tensor t) -> bool", TORCH_FN(uvm_storage));
-  m.def(
-      "uvm_to_device(Tensor self, Tensor prototype) -> Tensor",
-      TORCH_FN(uvm_to_device));
-  m.def("uvm_to_cpu(Tensor t) -> Tensor");
   m.def("new_managed_tensor(Tensor self, int[] sizes) -> Tensor");
   m.def("new_host_mapped_tensor(Tensor self, int[] sizes) -> Tensor");
   m.def("new_vanilla_managed_tensor(Tensor self, int[] sizes) -> Tensor");
-  m.def(
-      "cuda_mem_advise(Tensor t, int advice) -> ()",
-      TORCH_FN(uvm_cuda_mem_advise));
-  m.def(
-      "cuda_mem_prefetch_async(Tensor t, Tensor? device_t) -> ()",
-      TORCH_FN(uvm_cuda_mem_prefetch_async));
   m.def(
-      "uvm_mem_advice_dont_fork(Tensor t) -> ()",
-      TORCH_FN(uvm_mem_advice_dont_fork));
+      "new_unified_tensor(Tensor self, int[] sizes, bool is_host_mapped) -> Tensor");

-  m.def("uvm_to_cpu_clone(Tensor t) -> Tensor", TORCH_FN(uvm_to_cpu_clone));
-  m.def(FBGEMM_GPU_ENUM_OP(uvm, fbgemm_gpu_uvm_enum_query));
 }

 TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
-  DISPATCH_TO_META("new_managed_tensor", new_managed_tensor_meta);
+  DISPATCH_TO_CPU("new_unified_tensor", new_unified_tensor_cpu);
 }

 } // namespace fbgemm_gpu
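After this change the file only declares op schemas via m.def and binds the CPU kernel for new_unified_tensor; the CUDA and Meta bindings move to memory_utils_ops.cu below. This is the standard PyTorch split between defining a schema once and registering kernels per backend. A toy Python analogue using torch.library (a sketch assuming a recent PyTorch; the toylib names are made up, not FBGEMM API):

    import torch

    # Analogue of m.def(...): declare the schema once, in every build flavor.
    torch.library.define("toylib::double_it", "(Tensor self) -> Tensor")

    # Analogue of DISPATCH_TO_CPU(...): bind a kernel for one backend.
    @torch.library.impl("toylib::double_it", "CPU")
    def double_it_cpu(self: torch.Tensor) -> torch.Tensor:
        return self * 2

    print(torch.ops.toylib.double_it(torch.ones(3)))  # tensor([2., 2., 2.])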
23 changes: 21 additions & 2 deletions fbgemm_gpu/src/memory_utils/memory_utils_ops.cu
@@ -14,9 +14,28 @@
 namespace fbgemm_gpu {

 TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
-  DISPATCH_TO_CUDA("uvm_to_cpu", uvm_to_cpu);
+  m.def("is_uvm_tensor(Tensor t) -> bool", TORCH_FN(is_uvm_tensor));
+  m.def("uvm_storage(Tensor t) -> bool", TORCH_FN(uvm_storage));
+  m.def(
+      "uvm_to_device(Tensor self, Tensor prototype) -> Tensor",
+      TORCH_FN(uvm_to_device));
+
+  m.def(
+      "cuda_mem_advise(Tensor t, int advice) -> ()",
+      TORCH_FN(uvm_cuda_mem_advise));
+  m.def(
+      "cuda_mem_prefetch_async(Tensor t, Tensor? device_t) -> ()",
+      TORCH_FN(uvm_cuda_mem_prefetch_async));
+  m.def(
+      "uvm_mem_advice_dont_fork(Tensor t) -> ()",
+      TORCH_FN(uvm_mem_advice_dont_fork));
+
+  m.def("uvm_to_cpu_clone(Tensor t) -> Tensor", TORCH_FN(uvm_to_cpu_clone));
+  m.def("uvm_to_cpu(Tensor t) -> Tensor", TORCH_FN(uvm_to_cpu));
+
+  m.def(FBGEMM_GPU_ENUM_OP(uvm, fbgemm_gpu_uvm_enum_query));

   DISPATCH_TO_CUDA("new_managed_tensor", new_managed_tensor);
+  DISPATCH_TO_META("new_managed_tensor", new_managed_tensor_meta);
   DISPATCH_TO_CUDA("new_host_mapped_tensor", new_host_mapped_tensor);
   DISPATCH_TO_CUDA("new_unified_tensor", new_unified_tensor);
   DISPATCH_TO_CUDA("new_vanilla_managed_tensor", new_vanilla_managed_tensor);
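Since the UVM-specific schemas are now defined only in this CUDA-compiled file, those ops simply do not exist in a CPU-only build, while the allocator schemas above are registered everywhere. Code that must run on both build flavors can probe for them first (a small sketch; torch.ops raises AttributeError for unregistered ops, which hasattr absorbs):

    import torch
    import fbgemm_gpu  # noqa: F401

    if hasattr(torch.ops.fbgemm, "is_uvm_tensor"):
        # GPU build: the UVM introspection ops are registered.
        print(torch.ops.fbgemm.is_uvm_tensor(torch.zeros(1)))
    else:
        # CPU-only build: only the allocator ops are available.
        print("UVM ops not registered")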
26 changes: 0 additions & 26 deletions fbgemm_gpu/src/memory_utils/memory_utils_ops_cpu.cpp

This file was deleted.

19 changes: 7 additions & 12 deletions fbgemm_gpu/test/config/feature_gate_test.py
@@ -8,7 +8,6 @@
 # pyre-unsafe

 import unittest
-from contextlib import contextmanager

 # pyre-fixme[21]
 import fbgemm_gpu
@@ -17,21 +16,17 @@
 # pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
 open_source: bool = getattr(fbgemm_gpu, "open_source", False)

-if not open_source:
+if open_source:
+    # pyre-ignore[21]
+    from test_utils import TestSuite
+
+else:
     # pyre-fixme[21]
     from fbgemm_gpu.fb.config import FeatureGateName as FbFeatureGateName
+    from fbgemm_gpu.test.test_utils import TestSuite


-class FeatureGateTest(unittest.TestCase):
-    @contextmanager
-    # pyre-ignore[2]
-    def assertNotRaised(self, exc_type) -> None:
-        try:
-            # pyre-ignore[7]
-            yield None
-        except exc_type as e:
-            raise self.failureException(e)
-
+class FeatureGateTest(TestSuite):  # pyre-ignore[11]
     def test_feature_gates(self) -> None:
         for feature in FeatureGateName:
             # pyre-ignore[16]
12 changes: 12 additions & 0 deletions fbgemm_gpu/test/test_utils.py
@@ -10,6 +10,7 @@
 import os
 import subprocess
 import unittest
+from contextlib import contextmanager
 from functools import wraps
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

@@ -164,6 +165,17 @@ def dontGenerateOpCheckTests(reason: str):
     return optests.dontGenerateOpCheckTests(reason)


+class TestSuite(unittest.TestCase):
+    @contextmanager
+    # pyre-ignore[2]
+    def assertNotRaised(self, exc_type) -> None:
+        try:
+            # pyre-ignore[7]
+            yield None
+        except exc_type as e:
+            raise self.failureException(e)
+
+
 # Version of torch.autograd.gradcheck that works with generate_opcheck_tests.
 # The problem with just torch.autograd.gradcheck is that it results in
 # very slow tests when composed with generate_opcheck_tests.
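TestSuite now hosts assertNotRaised so multiple test files can share it: the context manager converts an exception of the given type into a test failure instead of an error. A minimal sketch of the intended use (the same pattern as the new ops_load_test.py below):

    import torch
    import fbgemm_gpu  # noqa: F401
    from fbgemm_gpu.test.test_utils import TestSuite  # OSS tests import from test_utils instead

    class MyOpsTest(TestSuite):
        def test_op_loads(self) -> None:
            # Fails the test via failureException if the block raises.
            with self.assertNotRaised(Exception):
                torch.ops.fbgemm.new_unified_tensor(torch.zeros(1), [10], False)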
44 changes: 44 additions & 0 deletions fbgemm_gpu/test/uvm/ops_load_test.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+# pyre-ignore-all-errors[56]
+
+import unittest
+
+import fbgemm_gpu
+import hypothesis.strategies as st
+import torch
+from hypothesis import given, settings
+
+# pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
+open_source: bool = getattr(fbgemm_gpu, "open_source", False)
+
+if open_source:
+    # pyre-ignore[21]
+    from test_utils import cpu_and_maybe_gpu, TestSuite
+else:
+    from fbgemm_gpu.test.test_utils import cpu_and_maybe_gpu, TestSuite
+
+
+class OpsLoadTest(TestSuite):  # pyre-ignore[11]
+    @given(
+        device=cpu_and_maybe_gpu(),
+        host_mapped=st.booleans(),
+    )
+    @settings(deadline=None)
+    def test_cpu_ops(self, device: torch.device, host_mapped: bool) -> None:
+        with self.assertNotRaised(Exception):  # pyre-ignore[16]
+            torch.ops.fbgemm.new_unified_tensor(
+                torch.zeros(1, device=device, dtype=torch.float),
+                [1000],
+                host_mapped,
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
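Because the module guards unittest.main(), the test can be run directly, e.g. python fbgemm_gpu/test/uvm/ops_load_test.py in an environment where fbgemm_gpu is installed. Per its name, the cpu_and_maybe_gpu() strategy should always yield a CPU device and add the GPU when one is present, so the test exercises new_unified_tensor on both build flavors and with both values of host_mapped.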
