当前位置: 首页 > article >正文

Ascend Extension for PyTorch的源码解析

1 源码下载

Ascend对pytorch代码的适配,可从以下链接中获取。
Ascend/pytorch
执行如下命令即可。

git clone https://gitee.com/ascend/pytorch.git

2 目录结构解析

源码下载后,如果需要编译torch-npu,最好保证pytorch的源码版本与torch-npu的版本匹配,并且编译环境中的gcc、g++等版本也与torch-npu的要求匹配,否则会出现各种乱七八糟的问题。

执行编译命令:bash ci/build.sh --python=3.x

例如,可能出现如下编译报错:


csrc/aten/AutoCastOps.cpp:28:70: error: macro "KERNEL_PRIVATEUSEONE" passed 3 arguments, but takes just 2
KERNEL_PRIVATEUSEONE(_convolution, deprecated, lower_precision_fp)

在torch-npu编译成功之后,通过generate_code.sh会生成如下文件:

    torch_npu/csrc/aten/ADInplaceOrViewTypeEverything.cpp
	torch_npu/csrc/aten/ADInplaceOrViewType_0.cpp
	torch_npu/csrc/aten/ADInplaceOrViewType_1.cpp
	torch_npu/csrc/aten/CustomFunctions.cpp
	torch_npu/csrc/aten/CustomFunctions.h
	torch_npu/csrc/aten/CustomRedispatch.cpp
	torch_npu/csrc/aten/CustomRedispatch.h
	torch_npu/csrc/aten/CustomRegisterSchema.cpp
	torch_npu/csrc/aten/ForeachRegister.cpp
	torch_npu/csrc/aten/Functions.cpp
	torch_npu/csrc/aten/Functions.h
	torch_npu/csrc/aten/NPUOpApiNativeFunctions.h
	torch_npu/csrc/aten/QuantizedRegister.cpp
	torch_npu/csrc/aten/RegisterFunctionalizationEverything.cpp
	torch_npu/csrc/aten/RegisterFunctionalization_0.cpp
	torch_npu/csrc/aten/RegisterFunctionalization_1.cpp
	torch_npu/csrc/aten/RegisterSparseCsrNPU.cpp
	torch_npu/csrc/aten/RegisterSparseNPU.cpp
	torch_npu/csrc/aten/VariableType.h
	torch_npu/csrc/aten/VariableTypeEverything.cpp
	torch_npu/csrc/aten/VariableType_0.cpp
	torch_npu/csrc/aten/npu_native_functions_by_codegen.yaml
	torch_npu/csrc/aten/python_functions.h
	torch_npu/csrc/aten/python_functionsEverything.cpp
	torch_npu/csrc/aten/python_functions_0.cpp
	torch_npu/csrc/aten/python_functions_1.cpp
	torch_npu/csrc/aten/variable_factories.h
	torch_npu/testing/_npu_testing_utils.py
	torch_npu/utils/custom_ops.py
	torch_npu/utils/exposed_api.py

上述文件生成路径默认的是torch_npu/csrc/aten。算子编译信息的yaml文件:torch_npu/csrc/aten/npu_native_functions.yaml

打开上述文件,从中分析可知,大概有3种方式实现昇腾npu算子的调用。

3 算子注册方式

本质上,ascend上对pytorch框架的适配代码,主要是将npu上的算子库对接起来。如何对接这些算子,是一套机制的问题,本身应该不复杂。

3.1 通过torch的register方式

直接调用npu的算子。torch_npu/csrc/aten/RegisterSparseNPU.cpp

TORCH_LIBRARY_IMPL(aten, SparsePrivateUse1, m) {
m.impl("abs", TORCH_FN(wrap_SparseNPU_abs_));
m.impl("abs_", TORCH_FN(wrap_SparseNPU_abs__));
m.impl("abs.out", TORCH_FN(wrap_SparseNPU_abs_out));
m.impl("sgn", TORCH_FN(wrap_SparseNPU_sgn_));
m.impl("sgn_", TORCH_FN(wrap_SparseNPU_sgn__));
m.impl("sgn.out", TORCH_FN(wrap_SparseNPU_sgn_out));

3.2 通过定义算子方式

参考文件:torch_npu/csrc/aten/CustomFunctions.cpp

#include <ATen/core/dispatch/Dispatcher.h>

#include "torch_npu/csrc/aten/CustomFunctions.h"


namespace at_npu {
namespace native {
namespace custom_ops {

// Thin wrapper around the dispatcher entry "npu::npu_change_data_ptr" (empty overload name).
// The typed operator handle is kept in a function-local static, so the schema lookup
// happens only on the first call; every call then forwards (dst, src, index) to it.
int64_t npu_change_data_ptr(const at::Tensor & dst, const at::Tensor & src, int64_t index) {
    using Signature = int64_t (const at::Tensor &, const at::Tensor &, int64_t);
    static auto handle = c10::Dispatcher::singleton()
        .findSchemaOrThrow("npu::npu_change_data_ptr", "")
        .typed<Signature>();
    return handle.call(dst, src, index);
}
// Thin wrapper around the dispatcher entry "npu::get_npu_format" (empty overload name).
// Looks the typed operator handle up once (function-local static) and returns whatever
// the dispatched operator returns for `self`.
int64_t get_npu_format(const at::Tensor & self) {
    using Signature = int64_t (const at::Tensor &);
    static auto handle = c10::Dispatcher::singleton()
        .findSchemaOrThrow("npu::get_npu_format", "")
        .typed<Signature>();
    return handle.call(self);
}
// Thin wrapper around the "Tensor" overload of the dispatcher entry "npu::npu_format_cast".
// The typed operator handle is resolved once and cached in a function-local static;
// the call forwards (self, dst) and returns the operator's result tensor.
at::Tensor npu_format_cast(const at::Tensor & self, const at::Tensor & dst) {
    using Signature = at::Tensor (const at::Tensor &, const at::Tensor &);
    static auto handle = c10::Dispatcher::singleton()
        .findSchemaOrThrow("npu::npu_format_cast", "Tensor")
        .typed<Signature>();
    return handle.call(self, dst);
}
// Thin wrapper around the "acl_format" overload of the dispatcher entry "npu::npu_format_cast_".
// The typed operator handle is resolved once and cached in a function-local static;
// the call forwards (self, acl_format) and returns the reference the operator returns.
at::Tensor & npu_format_cast_(at::Tensor & self, int64_t acl_format) {
    static auto op = c10::Dispatcher::singleton().findSchemaOrThrow("npu::npu_format_cast_", "acl_format").typed<at::Tensor & (at::Tensor &, int64_t)>();
    return op.call(self, acl_format);
}
 // Thin wrapper around the default (empty-name) overload of "npu::npu_format_cast_".
 // The typed operator handle is resolved once and cached in a function-local static;
 // the call forwards (self, src) and returns the reference the operator returns.
 at::Tensor & npu_format_cast_(at::Tensor & self, const at::Tensor & src) {
    using Signature = at::Tensor & (at::Tensor &, const at::Tensor &);
    static auto handle = c10::Dispatcher::singleton()
        .findSchemaOrThrow("npu::npu_format_cast_", "")
        .typed<Signature>();
    return handle.call(self, src);
}
// Thin wrapper around the dispatcher entry "npu::empty_with_format" (empty overload name).
// The typed operator handle is resolved once and cached in a function-local static;
// all six arguments are forwarded unchanged and the operator's result tensor is returned.
at::Tensor empty_with_format(at::IntArrayRef size, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory, int64_t acl_format) {
    using Signature = at::Tensor (at::IntArrayRef,
                                  ::std::optional<at::ScalarType>,
                                  ::std::optional<at::Layout>,
                                  ::std::optional<at::Device>,
                                  ::std::optional<bool>,
                                  int64_t);
    static auto handle = c10::Dispatcher::singleton()
        .findSchemaOrThrow("npu::empty_with_format", "")
        .typed<Signature>();
    return handle.call(size, dtype, layout, device, pin_memory, acl_format);
}
// Thin wrapper around the dispatcher entry "npu::unsafe_empty_with_format" (empty overload name).
// The typed operator handle is resolved once and cached in a function-local static;
// all seven arguments are forwarded unchanged and the operator's result tensor is returned.
at::Tensor unsafe_empty_with_format(at::IntArrayRef size, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory, int64_t acl_format, bool keep_format) {
    using Signature = at::Tensor (at::IntArrayRef,
                                  ::std::optional<at::ScalarType>,
                                  ::std::optional<at::Layout>,
                                  ::std::optional<at::Device>,
                                  ::std::optional<bool>,
                                  int64_t,
                                  bool);
    static auto handle = c10::Dispatcher::singleton()
        .findSchemaOrThrow("npu::unsafe_empty_with_format", "")
        .typed<Signature>();
    return handle.call(size, dtype, layout, device, pin_memory, acl_format, keep_format);
}
// (以上代码摘自 ~/pytorch-ascend/torch_npu/csrc/aten/CustomFunctions.cpp,其余部分省略)

...

}
}
}

3.3 通过API重定向映射的方式

参考文件:torch_npu/utils/custom_ops.py

# Bind each operator found under torch.ops.npu to an attribute of the same name on
# torch_npu, so that e.g. torch_npu.npu_fast_gelu refers to torch.ops.npu.npu_fast_gelu.
torch_npu.npu_layer_norm_eval = torch.ops.npu.npu_layer_norm_eval
torch_npu.npu_fused_attention_score_grad = torch.ops.npu.npu_fused_attention_score_grad
torch_npu.npu_quant_conv2d = torch.ops.npu.npu_quant_conv2d
torch_npu.npu_view_copy = torch.ops.npu.npu_view_copy
torch_npu.npu_fast_gelu = torch.ops.npu.npu_fast_gelu
torch_npu.npu_fused_attention_layernorm_qkv_fwd = torch.ops.npu.npu_fused_attention_layernorm_qkv_fwd
torch_npu.npu_fast_gelu_backward = torch.ops.npu.npu_fast_gelu_backward
torch_npu.npu_bmm_v2_mat1_backward = torch.ops.npu.npu_bmm_v2_mat1_backward

以上属于个人理解,如有错误敬请指正。


http://www.kler.cn/a/387369.html

相关文章:

  • SpeingMVC框架(三)
  • 【react】使用antd Table渲染数据遇到的报错问题
  • 【Vim Masterclass 笔记11】S06L24 + L25:Vim 文本的插入、变更、替换与连接操作同步练习(含点评课)
  • ElasticSearch|ES|架构介绍|原理浅析
  • 【Qt】01-了解QT
  • 深入浅出 Android AES 加密解密:从理论到实战
  • WPF自定义翻页控件
  • 简单的TCP程序
  • RK3568笔记1:BootRom
  • 【Linux 29】传输层协议 - UDP
  • 28-在CARLA包中获取地图
  • vue之vant上传图片
  • 数据结构-归并排序笔记
  • Java 连接操作 MySQL 数据库(增删查改操作)
  • 文献阅读 | Nature Methods:使用 STAMP 对空间转录组进行可解释的空间感知降维
  • LLMs在供应链投毒检测中的应用
  • 植物明星大乱斗1
  • 利用AI工具进行论文数据收集
  • 了解GPT大模型,读这本书就够了!(文末送书)
  • 【模块化大作战】Webpack如何搞定CommonJS与ES6混战(1-3)
  • 【网络】深入理解 HTTPS:确保数据传输安全的核心协议
  • 今天要重新认识下注解@RequestBody
  • IDEA构建JavaWeb项目,并通过Tomcat成功运行
  • 【快速入门】Kafka的安装部署
  • 关于QUERY_ALL_PACKAGES权限导致Google下架apk
  • LLM大模型学习精华系列:VLLM性能优化部署实践——全面加速从推理到部署的流程