编程模型和语言层

定义了一个自定义的PyTorch操作，并将其导出为ONNX格式。

自定义函数：MyAddFunction 继承自 torch.autograd.Function，包含两个主要静态方法：forward(ctx, a, b)：调用外部库函数 my_lib.my_add 来计算两个张量 a 和 b 的加法。symbolic(g, a, b)：为ONNX定义操作的符号表示，创建一个图节点用于将 a 乘以 2，然后加上 b。
MyAdd 是 torch.nn.Module 的子类，使用 MyAddFunction。在其 forward 方法中调用 my_add。
生成一个形状为 (1, 3, 10, 10) 的随机输入张量。
使用 torch.onnx.export 将模型导出为名为 my_add.onnx 的ONNX文件，传入相同的输入张量作为两个参数。
加载ONNX模型，并使用相同的输入张量进行推理，输出结果存储在 ort_output 中。
通过断言检查PyTorch模型的输出与ONNX模型的输出是否接近，确保导出和操作的正确性。

import torch
import my_lib
class MyAddFunction(torch.autograd.Function):
    """Custom autograd op whose eager path calls ``my_lib.my_add``.

    ``symbolic`` tells the ONNX exporter to emit ``Add(Mul(a, 2), b)`` for
    this op instead of tracing into the external library call.
    """

    @staticmethod
    def forward(ctx, a, b):
        """Delegate the eager computation to ``my_lib.my_add``."""
        result = my_lib.my_add(a, b)
        return result

    @staticmethod
    def symbolic(g, a, b):
        """Build the ONNX subgraph ``Add(Mul(a, 2), b)`` on graph ``g``."""
        factor = g.op("Constant", value_t=torch.tensor([2]))
        doubled = g.op('Mul', a, factor)
        return g.op('Add', doubled, b)

# Functional alias: calling my_add(a, b) invokes the custom op via Function.apply.
my_add = MyAddFunction.apply

class MyAdd(torch.nn.Module):
    """Module wrapper that forwards both inputs to the ``my_add`` op."""

    def __init__(self):
        super(MyAdd, self).__init__()

    def forward(self, a, b):
        """Return ``my_add(a, b)``."""
        out = my_add(a, b)
        return out

model = MyAdd()
input = torch.rand(1, 3, 10, 10)
# Name the graph inputs explicitly so the ONNX Runtime feed keys used below
# ('a' and 'b') do not depend on exporter-generated input names.
torch.onnx.export(model, (input, input), 'my_add.onnx', input_names=['a', 'b'])
torch_output = model(input, input).detach().numpy()

import onnxruntime
import numpy as np
# Pass the provider list explicitly: ONNX Runtime builds that ship additional
# execution providers require it, and CPU is available in every build.
sess = onnxruntime.InferenceSession('my_add.onnx', providers=['CPUExecutionProvider'])
ort_output = sess.run(None, {'a': input.numpy(), 'b': input.numpy()})[0]

# The exported graph must reproduce the eager PyTorch result.
assert np.allclose(torch_output, ort_output)

演示如何使用 PyTorch 和 ONNX Runtime 在不同设备（如 CPU 或 GPU）上进行推理。具体来说，它通过以下步骤展示了如何使用 ONNX Runtime 来运行一个简单的加法模型（两个张量相加），并使用不同的方式将数据传递到设备上进行计算。

模型定义了一个简单的加法运算，它接受两个输入张量 x 和 y，返回它们的加法结果。创建并导出模型为 ONNX 格式，其中 x 和 y 的大小是动态的。
根据当前设备是否支持 CUDA（运行在NVIDIA GPU），创建一个 ONNX Runtime 会话，可以在 CPU 或 GPU 上运行模型。
在 CPU 上运行模型，输入和输出都是 NumPy 数组。使用 PyTorch 张量运行模型，在设备上使用 PyTorch 张量进行推理。
在main函数中，第一个调用 run()，输入 x=[1.0, 2.0, 3.0]，y=[4.0, 5.0, 6.0]，输出 z=[5.0, 7.0, 9.0]。第二个调用 run_with_data_on_device()，输入 x=[1.0, 2.0, 3.0, 4.0, 5.0] 和 y=[1.0, 2.0, 3.0, 4.0, 5.0]，输出 z=[2.0, 4.0, 6.0, 8.0, 10.0]。第三个调用 run_with_torch_tensors_on_device()，生成两个随机的 PyTorch 张量，并返回加法结果，如 [0.7023, 1.3127, 1.7289, 0.3982, 0.8386]。最后一个调用也是 run_with_torch_tensors_on_device()，但这次使用 torch.int64 类型张量，输入 x=ones(5) 和 y=zeros(5)，输出 [1, 1, 1, 1, 1]

import numpy as np
import torch
import onnxruntime

# Path the exported ONNX graph is written to and later loaded from.
MODEL_FILE = '.model.onnx'
# Use CUDA when PyTorch reports it as available; otherwise fall back to CPU.
DEVICE_NAME = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE_INDEX = 0     # Replace this with the index of the device you want to run on
# Device string in "<name>:<index>" form, e.g. "cuda:0" or "cpu:0".
DEVICE=f'{DEVICE_NAME}:{DEVICE_INDEX}'

# A simple model to calculate addition of two tensors
def model():
    """Build and return a fresh module that adds its two inputs."""

    class Model(torch.nn.Module):
        """Element-wise addition of two tensors via ``Tensor.add``."""

        def __init__(self):
            super().__init__()

        def forward(self, x, y):
            total = x.add(y)
            return total

    adder = Model()
    return adder

# Create an instance of the model and export it to ONNX graph format, with dynamic size for the data
def create_model(type: torch.dtype = torch.float32):
    """Export ``model()`` to ``MODEL_FILE`` with a dynamic first axis on both inputs.

    ``type`` is the dtype of the length-3 sample tensors handed to the exporter.
    """
    example_inputs = (torch.ones(3, dtype=type), torch.zeros(3, dtype=type))
    # Axis 0 of each input is marked dynamic so arrays of any length can be fed.
    axes = {"x": {0 : "array_length_x"}, "y": {0: "array_length_y"}}

    torch.onnx.export(
        model(),
        example_inputs,
        MODEL_FILE,
        input_names=["x", "y"],
        output_names=["z"],
        dynamic_axes=axes,
    )
 
# Create an ONNX Runtime session with the provided model
def create_session(model: str) -> onnxruntime.InferenceSession:
    """Open an inference session for ``model``, listing CUDA first when available."""
    if torch.cuda.is_available():
        providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
    else:
        providers = ['CPUExecutionProvider']
    return onnxruntime.InferenceSession(model, providers=providers)

# Run the model on CPU consuming and producing numpy arrays 
def run(x: np.array, y: np.array) -> np.array:
    """Feed ``x`` and ``y`` to the model and return its first (``z``) output."""
    feeds = {"x": x, "y": y}
    outputs = create_session(MODEL_FILE).run(["z"], feeds)
    return outputs[0]

# Run the model on device consuming and producing ORTValues
def run_with_data_on_device(x: np.ndarray, y: np.ndarray) -> onnxruntime.OrtValue:
    """Run the model with inputs and output bound to DEVICE_NAME / DEVICE_INDEX.

    Args:
        x: First operand; wrapped in an OrtValue placed on the target device.
        y: Second operand; wrapped in an OrtValue placed on the target device.

    Returns:
        The first output OrtValue (``z``) retrieved from the IO binding.
    """
    session = create_session(MODEL_FILE)

    x_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(x, DEVICE_NAME, DEVICE_INDEX)
    y_ortvalue = onnxruntime.OrtValue.ortvalue_from_numpy(y, DEVICE_NAME, DEVICE_INDEX)

    io_binding = session.io_binding()
    # Bind with DEVICE_INDEX rather than a literal 0: the OrtValues above were
    # created on DEVICE_INDEX, so the binding must name the same device.
    io_binding.bind_input(name='x', device_type=x_ortvalue.device_name(), device_id=DEVICE_INDEX, element_type=x.dtype, shape=x_ortvalue.shape(), buffer_ptr=x_ortvalue.data_ptr())
    io_binding.bind_input(name='y', device_type=y_ortvalue.device_name(), device_id=DEVICE_INDEX, element_type=y.dtype, shape=y_ortvalue.shape(), buffer_ptr=y_ortvalue.data_ptr())
    # The output is bound with x's dtype and shape; no buffer_ptr is passed,
    # so allocating the output is left to ONNX Runtime.
    io_binding.bind_output(name='z', device_type=DEVICE_NAME, device_id=DEVICE_INDEX, element_type=x.dtype, shape=x_ortvalue.shape())
    session.run_with_iobinding(io_binding)

    z = io_binding.get_outputs()

    return z[0]

# Run the model on device consuming and producing native PyTorch tensors
def run_with_torch_tensors_on_device(x: torch.Tensor, y: torch.Tensor, np_type: np.dtype = np.float32, torch_type: torch.dtype = torch.float32) -> torch.Tensor:
    """Run the model directly on the memory of PyTorch tensors.

    Both inputs are made contiguous and bound by raw data pointer; the output
    is written into a freshly allocated tensor shaped like ``x`` on ``DEVICE``.

    Args:
        x: First operand.
        y: Second operand.
        np_type: Element type passed to every IO binding call.
        torch_type: dtype of the output tensor that is allocated here.

    Returns:
        The tensor whose buffer was bound as output ``z``.
    """
    session = create_session(MODEL_FILE)
    binding = session.io_binding()

    # Placement arguments shared by every bind call below.
    placement = dict(device_type=DEVICE_NAME, device_id=DEVICE_INDEX, element_type=np_type)

    # Keep the contiguous tensors referenced until the run completes, because
    # only their raw pointers are handed to the binding.
    x_tensor = x.contiguous()
    y_tensor = y.contiguous()
    for input_name, tensor in (('x', x_tensor), ('y', y_tensor)):
        binding.bind_input(
            name=input_name,
            shape=tuple(tensor.shape),
            buffer_ptr=tensor.data_ptr(),
            **placement,
        )

    # Allocate the PyTorch tensor that receives the model output.
    z_tensor = torch.empty(x_tensor.shape, dtype=torch_type, device=DEVICE).contiguous()
    binding.bind_output(
        name='z',
        shape=tuple(z_tensor.shape),
        buffer_ptr=z_tensor.data_ptr(),
        **placement,
    )

    session.run_with_iobinding(binding)

    return z_tensor


def main():
    """Export the model, then exercise each run helper and print its result."""
    create_model()

    cpu_sum = run(x=np.float32([1.0, 2.0, 3.0]),y=np.float32([4.0, 5.0, 6.0]))
    print(cpu_sum)
    # [5. 7. 9.]

    bound_sum = run_with_data_on_device(
        x=np.float32([1.0, 2.0, 3.0, 4.0, 5.0]),
        y=np.float32([1.0, 2.0, 3.0, 4.0, 5.0]),
    )
    print(bound_sum.numpy())
    # [ 2.  4.  6.  8. 10.]

    rand_x = torch.rand(5).to(DEVICE)
    rand_y = torch.rand(5).to(DEVICE)
    print(run_with_torch_tensors_on_device(rand_x, rand_y))
    # e.g. tensor([0.7023, 1.3127, 1.7289, 0.3982, 0.8386]) -- inputs are random

    # Re-export with int64 sample tensors before the int64 run.
    create_model(torch.int64)

    int_x = torch.ones(5, dtype=torch.int64).to(DEVICE)
    int_y = torch.zeros(5, dtype=torch.int64).to(DEVICE)
    print(run_with_torch_tensors_on_device(int_x, int_y, np_type=np.int64, torch_type=torch.int64))
    # tensor([1, 1, 1, 1, 1])


if __name__ == "__main__":
    main()

可以使用C++和ONNX Runtime来实现类似的加法操作。下面的示例演示了如何在C++中使用ONNX Runtime加载一个简单的加法模型并运行推理。首先，使用PyTorch创建一个简单的加法模型并将其导出为ONNX格式：

import torch

class SimpleAddModel(torch.nn.Module):
    """Module whose forward pass returns ``x + y``."""

    def forward(self, x, y):
        total = x + y
        return total

# Create the model and export it to ONNX.
model = SimpleAddModel()
# Two length-3 float32 sample tensors handed to the exporter.
x = torch.randn(3, dtype=torch.float32)
y = torch.randn(3, dtype=torch.float32)
torch.onnx.export(
    model,
    (x, y),
    "simple_add.onnx",
    input_names=['x', 'y'],
    output_names=['z'],
)

这段代码将创建一个简单的模型，将两个输入张量 x 和 y 相加，并导出为 simple_add.onnx。接下来，编写C++代码，使用ONNX Runtime加载并运行该模型。

#include <onnxruntime/core/session/onnxruntime_cxx_api.h>
#include <assert.h>
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

int main() {
    // Initialize ONNX Runtime environment
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "SimpleAdd");

    // Create ONNX Runtime session options
    Ort::SessionOptions session_options;
    session_options.SetIntraOpNumThreads(1);

    // Use GPU (CUDA) if available, otherwise fallback to CPU
    const char* cuda_provider = "CUDAExecutionProvider";
    if (Ort::GetAvailableProviders().count(cuda_provider)) {
        session_options.AppendExecutionProvider_CUDA(0);  // Device ID 0 for the first GPU
    } else {
        std::cout << "CUDA provider not available, running on CPU." << std::endl;
    }

    // Load the ONNX model
    const char* model_path = "simple_add.onnx";
    Ort::Session session(env, model_path, session_options);

    // Get model input/output details
    Ort::AllocatorWithDefaultOptions allocator;

    // Get the name and shape of the first input tensor ('x')
    char* input_name_x = session.GetInputName(0, allocator);
    Ort::TypeInfo input_type_info_x = session.GetInputTypeInfo(0);
    auto input_tensor_info_x = input_type_info_x.GetTensorTypeAndShapeInfo();
    std::vector<int64_t> input_shape_x = input_tensor_info_x.GetShape();

    // Get the name and shape of the second input tensor ('y')
    char* input_name_y = session.GetInputName(1, allocator);
    Ort::TypeInfo input_type_info_y = session.GetInputTypeInfo(1);
    auto input_tensor_info_y = input_type_info_y.GetTensorTypeAndShapeInfo();
    std::vector<int64_t> input_shape_y = input_tensor_info_y.GetShape();

    // Create input data (example: 3-element float vectors)
    std::vector<float> input_data_x = {1.0f, 2.0f, 3.0f};
    std::vector<float> input_data_y = {4.0f, 5.0f, 6.0f};

    // Create input tensor objects for 'x' and 'y'
    auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
    Ort::Value input_tensor_x = Ort::Value::CreateTensor<float>(memory_info, input_data_x.data(), input_data_x.size(), input_shape_x.data(), input_shape_x.size());
    Ort::Value input_tensor_y = Ort::Value::CreateTensor<float>(memory_info, input_data_y.data(), input_data_y.size(), input_shape_y.data(), input_shape_y.size());

    // Prepare input and output names
    const char* input_names[] = {input_name_x, input_name_y};
    const char* output_names[] = {"z"};

    // Run inference
    auto output_tensors = session.Run(Ort::RunOptions{nullptr}, input_names, &input_tensor_x, 2, output_names, 1);

    // Get output tensor and data
    float* output_data = output_tensors[0].GetTensorMutableData<float>();

    // Print the output results
    std::cout << "Output (z): ";
    for (size_t i = 0; i < input_data_x.size(); i++) {
        std::cout << output_data[i] << " ";
    }
    std::cout << std::endl;

    // Clean up
    allocator.Free(input_name_x);
    allocator.Free(input_name_y);

    return 0;
}

环境初始化：首先使用 Ort::Env 初始化 ONNX Runtime 环境，并指定日志级别为 ORT_LOGGING_LEVEL_WARNING。
加载模型：使用 Ort::Session 加载导出的 simple_add.onnx 模型。
输入信息：通过调用 GetInputName()（较新版本的 ONNX Runtime 中为 GetInputNameAllocated()）和 GetInputTypeInfo() 获取输入的名称和形状；输出名称 z 与导出模型时指定的 output_names 一致。这里假设输入 x 和 y 的形状为 [3]，即长度为3的一维张量。
创建输入张量：使用 Ort::Value::CreateTensor 创建包含输入数据的张量，这里是长度为3的浮点数数组。
运行推理：通过 session.Run() 执行模型推理，并获取输出张量。
输出结果：输出结果将存储在 output_data 中，最后我们将其打印到控制台。运行结果如下：

Output (z): 5 7 9

AI技术栈解析及应用- 作者：张真瑜 | 山东大学智能创新研究院

编程模型和语言层