Background:
https://www.hiascend.com/document/detail/zh/Atlas%20200I%20A2/23.0.0/re/npu/npusmi_013.html
Querying all NPU devices with npu-smi info -l:
[naie@notebook-npu-bd130045-55bbffd786-lr6t8 DCNN]$ npu-smi info -l
Total Count : 1
NPU ID : 6
Chip Count : 1
The script being run:
import torch_npu
from torch_npu.contrib import transfer_to_npu
import torch
import torch.nn as nn


class SingleConv(nn.Module):
    def __init__(self, in_ch, out_ch, kernel_size, stride, padding):
        super(SingleConv, self).__init__()
        self.single_conv = nn.Sequential(
            nn.Conv2d(in_ch, out_ch, kernel_size=kernel_size, padding=padding, stride=stride, bias=False),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True)
        )

    def forward(self, x):
        return self.single_conv(x)
class DenseFeaureAggregation(nn.Module):
    def __init__(self, in_ch, out_ch, base_ch):
        super(DenseFeaureAggregation, self).__init__()
        self.conv1 = nn.Sequential(
            nn.BatchNorm2d(num_features=1 * in_ch, eps=1e-5, affine=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_ch, base_ch, dilation=2, kernel_size=3, padding=2, stride=1, bias=True),
        )
        self.conv2 = nn.Sequential(
            nn.BatchNorm2d(num_features=in_ch + base_ch, eps=1e-5, affine=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_ch + base_ch, base_ch, dilation=3, kernel_size=3, padding=3, stride=1, bias=True),
        )
        self.conv3 = nn.Sequential(
            nn.BatchNorm2d(num_features=in_ch + 2 * base_ch, eps=1e-5, affine=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_ch + 2 * base_ch, base_ch, dilation=5, kernel_size=3, padding=5, stride=1, bias=True),
        )
        self.conv4 = nn.Sequential(
            nn.BatchNorm2d(num_features=in_ch + 3 * base_ch, eps=1e-5, affine=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_ch + 3 * base_ch, base_ch, dilation=7, kernel_size=3, padding=7, stride=1, bias=True),
        )
        self.conv5 = nn.Sequential(
            nn.BatchNorm2d(num_features=in_ch + 4 * base_ch, eps=1e-5, affine=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_ch + 4 * base_ch, base_ch, dilation=9, kernel_size=3, padding=9, stride=1, bias=True),
        )
        self.conv_out = nn.Sequential(
            nn.BatchNorm2d(num_features=in_ch + 5 * base_ch, eps=1e-5, affine=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_ch + 5 * base_ch, out_ch, dilation=1, kernel_size=1, padding=0, stride=1, bias=True),
        )

    def forward(self, x):
        out_ = self.conv1(x)
        concat_ = torch.cat((out_, x), dim=1)
        out_ = self.conv2(concat_)
        concat_ = torch.cat((concat_, out_), dim=1)
        out_ = self.conv3(concat_)
        concat_ = torch.cat((concat_, out_), dim=1)
        out_ = self.conv4(concat_)
        concat_ = torch.cat((concat_, out_), dim=1)
        out_ = self.conv5(concat_)
        concat_ = torch.cat((concat_, out_), dim=1)
        out_ = self.conv_out(concat_)
        return out_
class Encoder(nn.Module):
    def __init__(self, in_ch, list_ch):
        super(Encoder, self).__init__()
        self.encoder_1 = nn.Sequential(
            SingleConv(in_ch, list_ch[1], kernel_size=3, stride=1, padding=1),
            SingleConv(list_ch[1], list_ch[1], kernel_size=3, stride=1, padding=1)
        )
        self.encoder_2 = nn.Sequential(
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            SingleConv(list_ch[1], list_ch[2], kernel_size=3, stride=1, padding=1),
            SingleConv(list_ch[2], list_ch[2], kernel_size=3, stride=1, padding=1)
        )
        self.encoder_3 = nn.Sequential(
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            SingleConv(list_ch[2], list_ch[3], kernel_size=3, stride=1, padding=1),
            SingleConv(list_ch[3], list_ch[3], kernel_size=3, stride=1, padding=1)
        )
        self.encoder_4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            SingleConv(list_ch[3], list_ch[4], kernel_size=3, stride=1, padding=1),
            SingleConv(list_ch[4], list_ch[4], kernel_size=3, stride=1, padding=1)
        )
        self.DFA = DenseFeaureAggregation(list_ch[4], list_ch[4], list_ch[4])

    def forward(self, x):
        out_encoder_1 = self.encoder_1(x)
        out_encoder_2 = self.encoder_2(out_encoder_1)
        out_encoder_3 = self.encoder_3(out_encoder_2)
        out_encoder_4 = self.encoder_4(out_encoder_3)
        out_encoder_4 = self.DFA(out_encoder_4)
        return [out_encoder_1, out_encoder_2, out_encoder_3, out_encoder_4]
class Decoder(nn.Module):
    def __init__(self, out_ch, list_ch):
        super(Decoder, self).__init__()
        self.upconv_3_1 = nn.ConvTranspose2d(list_ch[4], list_ch[3], kernel_size=2, stride=2, bias=True)
        self.decoder_conv_3_1 = nn.Sequential(
            SingleConv(2 * list_ch[3], list_ch[3], kernel_size=3, stride=1, padding=1),
            SingleConv(list_ch[3], list_ch[3], kernel_size=3, stride=1, padding=1)
        )
        self.upconv_2_1 = nn.ConvTranspose2d(list_ch[3], list_ch[2], kernel_size=2, stride=2, bias=True)
        self.decoder_conv_2_1 = nn.Sequential(
            SingleConv(2 * list_ch[2], list_ch[2], kernel_size=3, stride=1, padding=1),
            SingleConv(list_ch[2], list_ch[2], kernel_size=3, stride=1, padding=1)
        )
        self.upconv_1_1 = nn.ConvTranspose2d(list_ch[2], list_ch[1], kernel_size=2, stride=2, bias=True)
        self.decoder_conv_1_1 = nn.Sequential(
            SingleConv(2 * list_ch[1], list_ch[1], kernel_size=3, stride=1, padding=1),
            SingleConv(list_ch[1], list_ch[1], kernel_size=3, stride=1, padding=1)
        )
        self.conv_out = nn.Sequential(
            nn.Conv2d(list_ch[1], out_ch, kernel_size=1, padding=0, bias=True)
        )

    def forward(self, out_encoder):
        out_encoder_1, out_encoder_2, out_encoder_3, out_encoder_4 = out_encoder
        out_decoder_3_1 = self.decoder_conv_3_1(
            torch.cat((self.upconv_3_1(out_encoder_4), out_encoder_3), dim=1)
        )
        out_decoder_2_1 = self.decoder_conv_2_1(
            torch.cat((self.upconv_2_1(out_decoder_3_1), out_encoder_2), dim=1)
        )
        out_decoder_1_1 = self.decoder_conv_1_1(
            torch.cat((self.upconv_1_1(out_decoder_2_1), out_encoder_1), dim=1)
        )
        output = self.conv_out(out_decoder_1_1)
        return [output]
class Model(nn.Module):
    def __init__(self, in_ch, out_ch, list_ch):
        super(Model, self).__init__()
        self.encoder = Encoder(in_ch, list_ch)
        self.decoder = Decoder(out_ch, list_ch)

        # init
        self.initialize()

    @staticmethod
    def init_conv_deconv_BN(modules):
        for m in modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0.)
            elif isinstance(m, nn.ConvTranspose2d):
                nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0.)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1.)
                nn.init.constant_(m.bias, 0.)

    def initialize(self):
        print('# random init encoder weight using nn.init.kaiming_uniform !')
        self.init_conv_deconv_BN(self.decoder.modules)
        print('# random init decoder weight using nn.init.kaiming_uniform !')
        self.init_conv_deconv_BN(self.encoder.modules)

    def forward(self, x):
        out_encoder = self.encoder(x)
        out_decoder = self.decoder(out_encoder)  # is a list
        return out_decoder
import re
import subprocess


def get_npu_id():
    try:
        # Run the command and capture its output
        output = subprocess.check_output(['npu-smi', 'info', '-l'], text=True)
        # Use a regular expression to find the NPU ID
        match = re.search(r'NPU ID\s+:\s+(\d+)', output)
        if match:
            return match.group(1)  # return the first captured group, i.e. the NPU ID
        else:
            return "NPU ID not found"
    except subprocess.CalledProcessError as e:
        return f"An error occurred: {e}"


network = Model(in_ch=4, out_ch=1,
                list_ch=[-1, 32, 64, 128, 256])

npu_id = get_npu_id()
# list_GPU_ids = [npu_id]
device = torch.device('cuda:' + str(npu_id))
network.to(device)
print("device:", npu_id)
The error:
Traceback (most recent call last):
File "/home/work/user-job-dir/app/notebook/RTDosePrediction-main/RTDosePrediction/Src/DCNN/test_device_id.py", line 211, in <module>
network.to(device)
File "/home/naie/.local/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py", line 56, in decorated
return fn(*args, **kwargs)
File "/home/naie/.local/lib/python3.9/site-packages/torch_npu/utils/module.py", line 68, in to
return self._apply(convert)
File "/home/naie/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 810, in _apply
module._apply(fn)
File "/home/naie/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 810, in _apply
module._apply(fn)
File "/home/naie/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 810, in _apply
module._apply(fn)
[Previous line repeated 2 more times]
File "/home/naie/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 833, in _apply
param_applied = fn(param)
File "/home/naie/.local/lib/python3.9/site-packages/torch_npu/utils/module.py", line 66, in convert
return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
File "/home/naie/.local/lib/python3.9/site-packages/torch_npu/contrib/transfer_to_npu.py", line 56, in decorated
return fn(*args, **kwargs)
RuntimeError: exchangeDevice:torch_npu/csrc/aten/common/CopyKernel.cpp:37 NPU error, error code is 107001
[ERROR] 2024-12-13-10:47:03 (PID:38196, Device:0, RankID:-1) ERR00100 PTA call acl api failed
[Error]: Invalid device ID.
Check whether the device ID is valid.
EE1001: 2024-12-13-10:47:03.815.272 The argument is invalid.Reason: Set device failed, invalid device, set device=6, valid device range is [0, 1)
Solution: 1.Check the input parameter range of the function. 2.Check the function invocation relationship.
TraceBack (most recent call last):
rtSetDevice execute failed, reason=[device id error][FUNC:FuncErrorReason][FILE:error_message_manage.cc][LINE:53]
open device 6 failed, runtime result = 107001.[FUNC:ReportCallError][FILE:log_inner.cpp][LINE:161]
My guess was that a mapping from physical device IDs to logical device IDs was happening.
I checked Huawei's official documentation:
https://www.hiascend.com/document/detail/zh/Atlas%20200I%20A2/23.0.0/re/npu/npusmi_013.html
and such a mapping does indeed exist.
So I used the following command to check the chip mapping in the current environment:
[naie@notebook-npu-bd130045-55bbffd786-lr6t8 DCNN]$ npu-smi info -m
        NPU ID          Chip ID         Chip Logic ID   Chip Name
        6               0               0               Ascend 910B3
        6               1               -               Mcu
Sure enough, the NPU with physical ID 6 is mapped to logical ID 0. This is because only a single NPU exists in the current (notebook) environment.
That leaves one question: when should the physical ID be used, and when the logical ID?
Physical ID
npu-smi info -t power -i id
The id used in commands like this one is the physical ID, because what npu-smi info -l reports is the physical ID.
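For example, a minimal sketch of issuing such a query from Python, assuming the physical ID 6 from this environment (only the value passed to -i matters here; parsing the output is omitted):

import subprocess

# Query the power of the NPU with *physical* ID 6, i.e. the ID reported by `npu-smi info -l`.
output = subprocess.check_output(['npu-smi', 'info', '-t', 'power', '-i', '6'], text=True)
print(output)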
Logical ID
device = torch.device('cuda:' + str(npu_id))
Code like this uses the logical ID.
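So a straightforward fix for the script above is to read the Chip Logic ID from npu-smi info -m instead of the NPU ID from npu-smi info -l. A minimal sketch, assuming the table layout shown above (the helper get_npu_logic_id and its regex are my own; torch and network come from the script earlier):

import re
import subprocess

def get_npu_logic_id():
    # Parse the mapping table printed by `npu-smi info -m`.
    # Assumes data rows look like "6    0    0    Ascend 910B3"
    # (NPU ID, Chip ID, Chip Logic ID, Chip Name); the Mcu row has "-" in the
    # Chip Logic ID column and therefore does not match.
    output = subprocess.check_output(['npu-smi', 'info', '-m'], text=True)
    for line in output.splitlines():
        match = re.match(r'\s*(\d+)\s+(\d+)\s+(\d+)\s+\S', line)
        if match:
            return match.group(3)  # the Chip Logic ID column
    return None

npu_logic_id = get_npu_logic_id()
device = torch.device('cuda:' + str(npu_logic_id))  # 'cuda:0' here; transfer_to_npu redirects it to the NPU
network.to(device)

With only one NPU visible, this yields device index 0, which is inside the valid range [0, 1) reported in the error message.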