


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from pathlib import Path
import torch
import torch.nn as nn
import torch.profiler
from torch._inductor import aoti_compile_and_package, aoti_load_package
from torch._subclasses.fake_tensor import FakeTensorMode
class MLP(nn.Module):
    """Two-layer MLP configurable across CPU, GPU, or a CPU-GPU hybrid split.

    ``fc1`` (+ GELU) is placed on *fc1_device*; ``fc2`` is placed on
    *fc2_device*.  When the two devices differ the forward pass inserts an
    explicit device transfer, preserved as an ``aten._to_copy`` node in the
    exported graph.  When they are the same the transfer is a no-op.

    Args:
        in_features: Size of each input sample fed to ``fc1``.
        hidden_features: Output width of ``fc1`` / input width of ``fc2``.
        out_features: Size of each output sample produced by ``fc2``.
        fc1_device: Device on which ``fc1`` is allocated.
        fc2_device: Device on which ``fc2`` is allocated.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: int,
        out_features: int,
        fc1_device: torch.device = torch.device("cpu"),
        fc2_device: torch.device = torch.device("cpu"),
    ) -> None:
        super().__init__()
        # Used as a context manager, torch.device makes the layers created
        # inside the block allocate their parameters on that device.
        with torch.device(fc1_device):
            self.fc1 = nn.Linear(in_features, hidden_features)
            self.act = nn.GELU()
        with torch.device(fc2_device):
            self.fc2 = nn.Linear(hidden_features, out_features)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply ``fc1`` -> GELU -> device transfer -> ``fc2`` to *x*."""
        h = self.act(self.fc1(x))
        # Move the hidden activation to wherever fc2's weights live so the
        # second matmul sees operands on a single device.
        h = h.to(self.fc2.weight.device)
        return self.fc2(h)
def aoti_compile(model: nn.Module, x: torch.Tensor,
                 package_path: str) -> object:
    """Export and AOTInductor-compile a model.

    A fake input with the same shape/dtype/device as *x* is used so that
    ``torch.export`` can trace the graph without allocating real activation
    memory.  Works for any device (cpu, cuda) and any model topology.

    Args:
        model: Module to export and compile.
        x: Example input; only its shape, dtype and device are read.
        package_path: Destination path handed to
            ``aoti_compile_and_package`` for the compiled package.

    Returns:
        Whatever ``aoti_load_package`` returns for the compiled package,
        loaded with ``run_single_threaded=True``.
    """
    # Only the example input is created under the fake-tensor mode.
    with FakeTensorMode():
        fake_input = torch.empty(x.shape, dtype=x.dtype, device=x.device)

    # NOTE(review): the original indentation was lost, so whether export ran
    # inside the FakeTensorMode context cannot be recovered; it is kept
    # outside here so compile and load run on real tensors -- confirm.
    ep = torch.export.export(model, (fake_input, ))
    compiled_package = aoti_compile_and_package(ep, package_path=package_path)
    return aoti_load_package(compiled_package, run_single_threaded=True)
def profile_runner(runner,
                   x: torch.Tensor,
                   trace_path: str,
                   label: str,
                   warmup: int = 3,
                   steps: int = 5) -> None:
    """Profile an AOTI runner and export a Chrome trace to *trace_path*.

    Note: AOTI runners call compiled C++ directly, bypassing the ATen
    dispatcher's profiling hooks. As a result, no cpu_op events (e.g.
    aten::mm, aten::gelu) appear in the trace -- the runner executes as an
    opaque native call from the profiler's perspective. What the trace does
    capture are CUDA runtime events (cudaLaunchKernel) and, when CUPTI is
    available, actual GPU kernel execution on the device timeline.

    Args:
        runner: Callable invoked as ``runner(x)`` once per profiler step.
        x: Input passed unchanged to every ``runner`` call.
        trace_path: File the Chrome trace is written to.
        label: Prefix used in the confirmation message printed at the end.
        warmup: Number of leading steps the profiler schedule treats as
            warm-up.
        steps: Number of steps in the schedule's active window.
    """
    activities = [torch.profiler.ProfilerActivity.CPU]
    # Request CUDA activity only when a CUDA device is usable, so CPU-only
    # hosts do not ask the profiler for events it cannot collect.
    if torch.cuda.is_available():
        activities.append(torch.profiler.ProfilerActivity.CUDA)

    schedule = torch.profiler.schedule(wait=0,
                                       warmup=warmup,
                                       active=steps,
                                       repeat=1)
    with torch.profiler.profile(
            activities=activities,
            schedule=schedule,
            record_shapes=True,
            with_flops=True,
    ) as prof:
        # One schedule cycle is exactly warmup + steps iterations.
        for step in range(warmup + steps):
            with torch.profiler.record_function(f"step_{step}"):
                runner(x)
            # Advance the schedule after each iteration.
            prof.step()

    prof.export_chrome_trace(trace_path)
    print(f"{label} trace written to {trace_path}")
if __name__ == "__main__":
    # Demo driver: for each placement (CPU, GPU, CPU-GPU hybrid) build the
    # MLP, AOTInductor-compile it, check the compiled runner against eager
    # execution, and write a profiler trace next to this file.
    cpu_device = torch.device("cpu")
    gpu_device = torch.device("cuda")
    artifacts_dir = Path(__file__).parent / "aoti_artifacts"
    artifacts_dir.mkdir(exist_ok=True)

    # (tag used in messages, fc1 device, fc2 device, input device,
    #  artifact file stem)
    configs = [
        ("CPU", cpu_device, cpu_device, cpu_device, "cpu"),
        ("GPU", gpu_device, gpu_device, gpu_device, "cuda"),
        ("CPU-GPU hybrid", cpu_device, gpu_device, cpu_device, "hybrid"),
    ]
    has_cuda = torch.cuda.is_available()

    for tag, fc1_device, fc2_device, input_device, stem in configs:
        needs_cuda = any(
            dev.type == "cuda"
            for dev in (fc1_device, fc2_device, input_device))
        if needs_cuda and not has_cuda:
            # Without this guard the GPU and hybrid runs fail on CPU-only
            # hosts as soon as a tensor is allocated on "cuda".
            print(f"Skipping {tag} run: CUDA is not available.")
            continue

        model = MLP(128, 256, 10, fc1_device=fc1_device,
                    fc2_device=fc2_device).eval()
        x = torch.randn(4, 128, device=input_device)
        runner = aoti_compile(model, x, str(artifacts_dir / f"{stem}.pt2"))
        # The compiled runner must match eager execution on the same input.
        torch.testing.assert_close(runner(x), model(x))
        print(f"AOTInductor compile ({tag}) succeeded.")
        profile_runner(runner, x, str(artifacts_dir / f"{stem}_trace.json"),
                       f"AOTInductor ({tag})")
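# Aggregator notice: this content was automatically collected and arranged by
# 惯性聚合 (an RSS reader) and is provided for reading reference only.
# Original source: (not given) -- copyright belongs to the original author.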