查看cuda版本:nvcc -V 显示我的版本是10.2版本
查看cudnn版本:cat /usr/local/cuda/include/cudnn.h | grep CUDNN_MAJOR -A 2
显示我的cudnn版本为7.5.0版本
在官网下载trt安装包:根据系统、cuda、cudnn下载,下载地址:
https://developer.nvidia.cn/nvidia-tensorrt-8x-download
由于我需要下载TensorRT 8.x,要求cudnn的最低版本为8.2(本机为7.5.0,不满足要求),所以需要先去下载新版cudnn。
https://developer.nvidia.com/rdp/cudnn-archive
cd TensorRT-8.0.1.6/python/
pip install tensorrt-8.0.1.6-cp37-none-linux_x86_64.whl # 根据python版本安装
添加依赖环境cudnn:
export LD_LIBRARY_PATH=/home/work/guopei/cuda/lib64:$LD_LIBRARY_PATH
添加依赖环境tensorrt:
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/work/guopei/TensorRT-8.0.1.6/lib
添加这个就可以使用trtexec
export PATH=/home/work/guopei/TensorRT-8.0.1.6/bin:$PATH
trtexec --onnx=resnet50/model.onnx --saveEngine=resnet_engine.trt --explicitBatch --device=5
注意:在onnx转trt之前,最好执行一下命令:
python -m onnxsim test1.onnx test2.onnx
查看onnx结构:https://lutzroeder.github.io/netron/
# 四、代码示例 — Section 4: code example (ONNX -> TensorRT conversion and inference wrapper)
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import numpy as np

EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)


def GiB(val):
    """Return ``val`` multiplied by 2**30 (i.e. ``val`` GiB expressed in bytes)."""
    # Parenthesised so the result does not depend on `*` / `<<` precedence
    # and so that non-integer values (e.g. 0.5) are accepted as well.
    return val * (1 << 30)


def ONNX_to_TRT(onnx_model_path=None, trt_engine_path=None, fp16_mode=False):
    """Build a serialized TensorRT engine from an ONNX file and save it to disk.

    Written against the TensorRT 8 Python API; the original author notes that
    only fixed input sizes are supported.

    Args:
        onnx_model_path: Path of the ONNX model to load.
        trt_engine_path: Path the serialized engine is written to.
        fp16_mode: When True, the FP16 builder flag is set.

    Raises:
        AssertionError: If the ONNX parser rejects the model.
        RuntimeError: If the builder does not return a serialized engine.
    """
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(EXPLICIT_BATCH)
    parser = trt.OnnxParser(network, TRT_LOGGER)

    config = builder.create_builder_config()
    config.max_workspace_size = GiB(1)
    if fp16_mode:
        config.set_flag(trt.BuilderFlag.FP16)

    with open(onnx_model_path, 'rb') as model:
        parsed = parser.parse(model.read())
    if not parsed:
        # The parse call used to live inside an `assert`, which `python -O`
        # strips entirely (the model would then never be parsed at all).
        # AssertionError is kept as the exception type for compatibility.
        details = '; '.join(
            str(parser.get_error(idx)) for idx in range(parser.num_errors)
        )
        raise AssertionError(
            'Failed to parse ONNX model {}: {}'.format(onnx_model_path, details)
        )

    serialized_engine = builder.build_serialized_network(network, config)
    if serialized_engine is None:
        # Fail before the output file is created/truncated.
        raise RuntimeError(
            'TensorRT could not build an engine from {}'.format(onnx_model_path)
        )

    with open(trt_engine_path, 'wb') as f:
        f.write(serialized_engine)  # write the serialized engine

    print('TensorRT file in ' + trt_engine_path)
    print('============ONNX->TensorRT SUCCESS============')


class TrtModel():
    '''
    TensorRT inference wrapper.

    Loads a serialized engine, allocates one page-locked host buffer and one
    device buffer per binding, and runs inference through ``__call__``.
    Call ``destroy()`` when finished to pop the CUDA context created here.
    '''

    def __init__(self, trt_path):
        """Deserialize the engine at ``trt_path`` and allocate its I/O buffers."""
        # make_context() creates a context on device 0 and makes it current.
        self.ctx = cuda.Device(0).make_context()
        try:
            stream = cuda.Stream()
            TRT_LOGGER = trt.Logger(trt.Logger.INFO)
            runtime = trt.Runtime(TRT_LOGGER)

            # Deserialize the engine from file
            with open(trt_path, "rb") as f:
                engine = runtime.deserialize_cuda_engine(f.read())
            context = engine.create_execution_context()

            host_inputs = []
            cuda_inputs = []
            host_outputs = []
            cuda_outputs = []
            bindings = []

            for binding in engine:
                shape = engine.get_binding_shape(binding)
                print('bingding:', binding, shape)
                size = trt.volume(shape) * engine.max_batch_size
                dtype = trt.nptype(engine.get_binding_dtype(binding))
                # Allocate host and device buffers
                host_mem = cuda.pagelocked_empty(size, dtype)
                cuda_mem = cuda.mem_alloc(host_mem.nbytes)
                # Append the device buffer to device bindings.
                bindings.append(int(cuda_mem))
                # Append to the appropriate list.
                if engine.binding_is_input(binding):
                    # Last two dimensions of the input binding shape.
                    self.input_w = shape[-1]
                    self.input_h = shape[-2]
                    host_inputs.append(host_mem)
                    cuda_inputs.append(cuda_mem)
                else:
                    host_outputs.append(host_mem)
                    cuda_outputs.append(cuda_mem)
        except Exception:
            # Do not leave the freshly pushed context on the stack when
            # construction fails; the caller never gets an object to destroy().
            self.ctx.pop()
            raise

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

    def __call__(self, img_np_nchw):
        '''
        Run TensorRT inference.

        :param img_np_nchw: input image as a numpy array; it is flattened and
            copied into the first input buffer
        :return: the first host output buffer (flat array; reused between calls)
        '''
        self.ctx.push()
        try:
            stream = self.stream
            host_inputs = self.host_inputs
            cuda_inputs = self.cuda_inputs
            host_outputs = self.host_outputs
            cuda_outputs = self.cuda_outputs

            np.copyto(host_inputs[0], img_np_nchw.ravel())
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # NOTE(review): engines built by ONNX_to_TRT use an explicit-batch
            # network; TensorRT documents execute_async_v2 for that case.
            # Confirm whether this call should be migrated.
            self.context.execute_async(batch_size=self.batch_size,
                                       bindings=self.bindings,
                                       stream_handle=stream.handle)
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            stream.synchronize()
        finally:
            # Always undo the push above, even when a copy or the execution
            # raises, so the context stack stays balanced.
            self.ctx.pop()
        return host_outputs[0]

    def destroy(self):
        """Pop the context pushed by ``__init__`` from the context stack."""
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()
yolov5-face测试代码:
import os
import sys
import cv2
import copy
import torch
import argparse

# Project root: the parent of the directory that contains this file.
root_path = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
sys.path.append(root_path)  # make the project packages importable

from utils.general import check_img_size, non_max_suppression_face, scale_coords, xyxy2xywh
from utils.datasets import letterbox
from detect_face import scale_coords_landmarks, show_results
from torch2trt.trt_model import TrtModel

cur_path = os.path.abspath(os.path.dirname(__file__))


def img_process(img_path, long_side=512, stride_max=32):
    '''
    Image preprocessing.

    Reads the image, resizes it so that its longer side equals ``long_side``,
    letterboxes it to a fixed size, converts BGR->RGB and HWC->CHW, scales the
    values to [0, 1] and adds a batch dimension.

    :param img_path: path of the image to read
    :param long_side: target length of the longer image side
    :param stride_max: stride passed to ``check_img_size``
    :return: ``(img, orgimg)`` - the preprocessed tensor and the original image
    :raises FileNotFoundError: if the image cannot be read
    '''
    orgimg = cv2.imread(img_path)
    if orgimg is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError('Cannot read image: {}'.format(img_path))

    img0 = copy.deepcopy(orgimg)
    h0, w0 = orgimg.shape[:2]  # orig hw
    r = long_side / max(h0, w0)  # resize image to img_size
    if r != 1:  # always resize down, only resize up if training with augmentation
        interp = cv2.INTER_AREA if r < 1 else cv2.INTER_LINEAR
        img0 = cv2.resize(img0, (int(w0 * r), int(h0 * r)), interpolation=interp)

    imgsz = check_img_size(long_side, s=stride_max)  # check img_size
    # auto=False keeps a fixed output size (auto=True would use the minimal rectangle)
    img = letterbox(img0, new_shape=imgsz, auto=False)[0]

    # Convert
    img = img[:, :, ::-1].transpose(2, 0, 1).copy()  # BGR to RGB, HWC to CHW
    img = torch.from_numpy(img)
    img = img.float()  # uint8 to float32
    img /= 255.0  # 0 - 255 to 0.0 - 1.0
    if img.ndimension() == 3:
        img = img.unsqueeze(0)

    return img, orgimg


def img_vis(img, orgimg, pred, vis_thres=0.3):
    '''
    Visualise predictions and save the result image.

    Detections whose confidence is below ``vis_thres`` are skipped. The
    annotated image is written to ``result.jpg`` next to this file.

    :param img: preprocessed tensor fed to the model
    :param orgimg: original image
    :param pred: detections after NMS, one entry per image
    :param vis_thres: visualisation confidence threshold
    '''
    print('img.shape: ', img.shape)
    print('orgimg.shape: ', orgimg.shape)

    # Process detections
    for det in pred:  # detections per image
        gn = torch.tensor(orgimg.shape)[[1, 0, 1, 0]]  # normalization gain whwh
        gn_lks = torch.tensor(orgimg.shape)[[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]]  # normalization gain landmarks
        if not len(det):
            continue

        # Rescale boxes and landmarks from img_size to the original image size
        det[:, :4] = scale_coords(img.shape[2:], det[:, :4], orgimg.shape).round()
        det[:, 5:15] = scale_coords_landmarks(img.shape[2:], det[:, 5:15], orgimg.shape).round()

        for j in range(det.size()[0]):
            if det[j, 4].cpu().numpy() < vis_thres:
                continue

            xywh = (xyxy2xywh(det[j, :4].view(1, 4)) / gn).view(-1).tolist()
            conf = det[j, 4].cpu().numpy()
            landmarks = (det[j, 5:15].view(1, 10) / gn_lks).view(-1).tolist()
            class_num = det[j, 15].cpu().numpy()
            orgimg = show_results(orgimg, xywh, conf, landmarks, class_num)

    cv2.imwrite(cur_path+'/result.jpg', orgimg)
    print('result save in '+cur_path+'/result.jpg')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--img_path', type=str, default=cur_path+"/test.jpg", help='img path')
    parser.add_argument('--trt_path', type=str, required=True, help='trt_path')
    # Three integers, e.g. `--output_shape 1 25200 16`. The default matches
    # the 512-pixel preprocessing done by img_process().
    parser.add_argument('--output_shape', type=int, nargs=3, default=[1, 16128, 16],
                        help='input[1,3,512,512] -> output[1,16128,16]; '
                             'input[1,3,640,640] -> output[1,25200,16]')
    opt = parser.parse_args()

    img, orgimg = img_process(opt.img_path)
    model = TrtModel(opt.trt_path)
    try:
        pred = model(img.numpy()).reshape(opt.output_shape)  # forward
    finally:
        # Release the CUDA context even when inference fails.
        model.destroy()

    # Apply NMS
    pred = non_max_suppression_face(torch.from_numpy(pred), conf_thres=0.3, iou_thres=0.5)

    # ============ visualisation ================
    img_vis(img, orgimg, pred)