Hi everyone, these are my self-study notes. If you spot any mistakes, please point them out, and feel free to discuss in the comments. Thanks!
The code in these notes comes from the link shared in NVIDIA's official course on Bilibili (see note 001). I modified the code slightly and added extensive comments (the full code is attached below).
This project builds a network containing only a single Identity layer (an identity mapping), serializes the network and saves it as a .plan file, then loads the .plan file, feeds the network an input tensor of shape 3x4x5, and prints the computed result.
At inference time, a standard TensorRT workflow generally goes like this: load the engine, load the network, add weights and biases to the network, then allocate host memory and device memory, preprocess the data, move the data from host memory to device memory, let the GPU run the computation, move the finished results from device memory back to host memory, postprocess the data (pre- and post-processing can also be done on the GPU, in which case you have to write your own kernels and set up plugins), and finally output the data. The sketch right below condenses the runtime half of this flow into concrete API calls.
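A minimal sketch of that runtime flow, assuming the TensorRT 8.5-era API used in this project; the function runOnce, the nBytes parameter, and the single fixed-size FP32 input/output pair are illustrative assumptions, not part of the project code:
#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <vector>
// Hypothetical helper: one inference pass with an already-deserialized engine
void runOnce(nvinfer1::ICudaEngine *engine, size_t nBytes)
{
    nvinfer1::IExecutionContext *context = engine->createExecutionContext();
    // with dynamic shapes, context->setInputShape(...) must come before enqueueing
    std::vector<char> hostIn(nBytes), hostOut(nBytes);                  // allocate host memory
    void *devIn = nullptr, *devOut = nullptr;
    cudaMalloc(&devIn, nBytes);                                         // allocate device memory
    cudaMalloc(&devOut, nBytes);
    // ... preprocess the input into hostIn ...
    cudaMemcpy(devIn, hostIn.data(), nBytes, cudaMemcpyHostToDevice);   // host memory -> device memory
    context->setTensorAddress(engine->getIOTensorName(0), devIn);       // bind the input tensor address
    context->setTensorAddress(engine->getIOTensorName(1), devOut);      // bind the output tensor address
    context->enqueueV3(0);                                              // let the GPU run (default stream)
    cudaMemcpy(hostOut.data(), devOut, nBytes, cudaMemcpyDeviceToHost); // device memory -> host memory
    // ... postprocess the results in hostOut ...
    cudaFree(devIn);
    cudaFree(devOut);
    delete context;                                                     // TensorRT 8.x objects can be plain-deleted
}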
The project is compiled with nvcc rather than gcc or g++; the compilation flags are set via CCFLAG in the Makefile:
...
NVCC = $(CUDA_PATH)/bin/nvcc
...
CCFLAG = -std=c++14 -O3 $(DEBUG_MACRO) -Xcompiler -fPIC -use_fast_math
...
When creating the Network, this project enables explicit batch, because only explicit batch supports dynamic shapes. With dynamic shapes, we can configure the network with just a range for the input tensor's dimensions rather than fixed values, and then pin the actual input dimensions down at inference time, which is very flexible.
When configuring the network, first give a range of values:
profile->setDimensions(inputTensor->getName(), nvinfer1::OptProfileSelector::kMIN, Dims32 {3, {1, 1, 1}});   // set the minimum input dimensions
profile->setDimensions(inputTensor->getName(), nvinfer1::OptProfileSelector::kOPT, Dims32 {3, {3, 4, 5}});   // set the optimal input dimensions
profile->setDimensions(inputTensor->getName(), nvinfer1::OptProfileSelector::kMAX, Dims32 {3, {6, 8, 10}});  // set the maximum input dimensions
Then, just before actually running inference, fix the input shape (a sanity-check sketch follows this snippet):
context->setInputShape(vTensorName[0].c_str(), Dims32 {3, {3, 4, 5}});
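Optionally, you can verify before enqueueing that no dynamic dimension was left unresolved; a one-line check, assuming the TensorRT 8.x API:
if (!context->allInputDimensionsSpecified()) // true only once every -1 dimension has been fixed
{
    std::cout << "Input dimensions are not fully specified!" << std::endl;
}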
That covers the main points; everything else is explained in the comments, and I wrote plenty of them.
./
├── create_save_engine.cpp
├── include
│ ├── cookbookHelper.cuh
│ ├── CreateSaveEngine.h
│ └── LoadRunEngine.h
├── load_run_engine.cpp
├── main.cpp
├── main.py
└── Makefile
1 directory, 8 files
create_save_engine.cpp
#include "./include/CreateSaveEngine.h"
using namespace nvinfer1;
static Logger gLogger(ILogger::Severity::kERROR);
void create_save_engine(std::string engine_file_name)
{
    // Create the network
    nvinfer1::IBuilder *builder = nvinfer1::createInferBuilder(gLogger);               // create the builder from the logger
    const auto explicitBatch = 1U << int(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    nvinfer1::INetworkDefinition *network = builder->createNetworkV2(explicitBatch);   // create the network in explicit-batch mode; only explicit batch supports dynamic shapes
    nvinfer1::IOptimizationProfile *profile = builder->createOptimizationProfile();    // create the optimization profile for the network inputs
    nvinfer1::IBuilderConfig *config = builder->createBuilderConfig();                 // create the builder configuration
    nvinfer1::ICudaEngine *engine = nullptr;
    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 1 << 30);         // cap the workspace memory pool at 1 GiB
    // Configure and add the network's input tensor
    nvinfer1::ITensor *inputTensor = network->addInput("inputT0", nvinfer1::DataType::kFLOAT, Dims32 {3, {-1, -1, -1}}); // kFLOAT selects 32-bit float; a dimension of -1 marks it as dynamic
    profile->setDimensions(inputTensor->getName(), nvinfer1::OptProfileSelector::kMIN, Dims32 {3, {1, 1, 1}});   // set the minimum input dimensions
    profile->setDimensions(inputTensor->getName(), nvinfer1::OptProfileSelector::kOPT, Dims32 {3, {3, 4, 5}});   // set the optimal input dimensions
    profile->setDimensions(inputTensor->getName(), nvinfer1::OptProfileSelector::kMAX, Dims32 {3, {6, 8, 10}});  // set the maximum input dimensions
    config->addOptimizationProfile(profile);
    // Add a layer to the network
    nvinfer1::IIdentityLayer *identityLayer = network->addIdentity(*inputTensor);      // identity layer: the output is exactly the input
    // Mark the network's output tensor
    network->markOutput(*identityLayer->getOutput(0));
    // Serialize the engine and save it
    nvinfer1::IHostMemory *engineString = builder->buildSerializedNetwork(*network, *config); // build the serialized network
    if (engineString == nullptr || engineString->size() == 0) { std::cout << "Failed to build serialized network!" << std::endl; return; }
    std::cout << "Succeeded in building serialized network!" << std::endl;
    nvinfer1::IRuntime *runtime {nvinfer1::createInferRuntime(gLogger)};               // create the runtime
    engine = runtime->deserializeCudaEngine(engineString->data(), engineString->size()); // deserialize the engine as a sanity check before saving
    if (engine == nullptr) { std::cout << "Failed to deserialize engine!" << std::endl; return; }
    std::cout << "Succeeded in deserializing engine!" << std::endl;
    std::ofstream engineFile(engine_file_name, std::ios::binary);                      // open the output file
    if (!engineFile) { std::cout << "Failed to open file!" << std::endl; return; }
    engineFile.write(static_cast<char *>(engineString->data()), engineString->size()); // save the engine to file in binary format
    if (engineFile.fail()) { std::cout << "Failed to save engine file!" << std::endl; return; }
    std::cout << "Succeeded in saving engine file!" << std::endl;
    return;
}
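One thing the sample leaves out: none of the TensorRT objects created above are ever released. Since TensorRT 8.0 the library's interfaces can simply be deleted (destroy() is the deprecated pre-8.0 spelling), so a possible cleanup just before the final return would be:
delete engine;       // release in roughly reverse order of creation
delete runtime;
delete engineString;
delete config;
delete network;      // the profile is owned by the builder and is not deleted separately
delete builder;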
load_run_engine.cpp
#include "./include/LoadRunEngine.h"
using namespace nvinfer1;
static Logger gLogger(ILogger::Severity::kERROR);
void load_run_engine(std::string engine_file_name)
{
    nvinfer1::ICudaEngine *engine = nullptr;
    std::ifstream engineFile(engine_file_name, std::ios::binary); // open the engine file in binary mode
    long int fsize = 0;
    // Read the engine file
    engineFile.seekg(0, engineFile.end);
    fsize = engineFile.tellg();
    engineFile.seekg(0, engineFile.beg);
    std::vector<char> engineString(fsize);
    engineFile.read(engineString.data(), fsize);
    if (engineString.size() == 0) { std::cout << "Failed to read engine file!" << std::endl; return; }
    std::cout << "Succeeded in reading engine file!" << std::endl;
    // Create a runtime for inference
    nvinfer1::IRuntime *runtime {nvinfer1::createInferRuntime(gLogger)};
    // Let the runtime deserialize the engine
    engine = runtime->deserializeCudaEngine(engineString.data(), fsize);
    if (engine == nullptr) { std::cout << "Failed to load engine!" << std::endl; return; }
    std::cout << "Succeeded in loading engine!" << std::endl;
    long unsigned int nIO = engine->getNbIOTensors();
    long unsigned int nInput = 0;
    long unsigned int nOutput = 0;
    std::vector<std::string> vTensorName(nIO);
    for (int i = 0; i < nIO; ++i)
    {
        vTensorName[i] = std::string(engine->getIOTensorName(i));
        nInput += int(engine->getTensorIOMode(vTensorName[i].c_str()) == nvinfer1::TensorIOMode::kINPUT);   // count the engine's input tensors
        nOutput += int(engine->getTensorIOMode(vTensorName[i].c_str()) == nvinfer1::TensorIOMode::kOUTPUT); // count the engine's output tensors
    }
    // Create the execution context and use it to fix the network's input and output shapes
    nvinfer1::IExecutionContext *context = engine->createExecutionContext();
    context->setInputShape(vTensorName[0].c_str(), Dims32 {3, {3, 4, 5}});
    for (int i = 0; i < nIO; ++i) // print the model's I/O layout
    {
        std::cout << std::string(i < nInput ? "Input [" : "Output[");
        std::cout << i << std::string("]-> ");
        std::cout << "dataType:" << dataTypeToString(engine->getTensorDataType(vTensorName[i].c_str())) << std::string(" ");
        std::cout << "engine->getTensorShape:" << shapeToString(engine->getTensorShape(vTensorName[i].c_str())) << std::string(" ");
        std::cout << "context->getTensorShape:" << shapeToString(context->getTensorShape(vTensorName[i].c_str())) << std::string(" ");
        std::cout << "TensorName:" << vTensorName[i] << std::endl;
        std::cout << std::endl;
    }
    std::vector<int> vTensorSize(nIO, 0);
    for (int i = 0; i < nIO; ++i)
    {
        Dims32 dim = context->getTensorShape(vTensorName[i].c_str()); // get each I/O tensor's shape
        int size = 1;
        // In Dims32, nbDims is the number of dimensions of the shape and d[] holds the size of each dimension
        for (int j = 0; j < dim.nbDims; ++j)
        {
            size *= dim.d[j]; // accumulate the number of elements in the tensor
        }
        vTensorSize[i] = size * dataTypeToSize(engine->getTensorDataType(vTensorName[i].c_str())); // bytes to allocate for this tensor
    }
    std::vector<void *> vBufferH {nIO, nullptr}; // host-side buffers, pointing to host memory
    std::vector<void *> vBufferD {nIO, nullptr}; // device-side buffers, pointing to GPU memory
    for (int i = 0; i < nIO; ++i)
    {
        vBufferH[i] = (void *)new char[vTensorSize[i]];
        CHECK(cudaMalloc(&vBufferD[i], vTensorSize[i])); // allocate GPU memory of the size computed above
    }
    float *pData = (float *)vBufferH[0];
    for (int i = 0; i < vTensorSize[0] / dataTypeToSize(engine->getTensorDataType(vTensorName[0].c_str())); ++i)
    {
        pData[i] = float(i); // fill the host buffer with the data to be processed
    }
    for (int i = 0; i < nInput; ++i)
    {
        CHECK(cudaMemcpy(vBufferD[i], vBufferH[i], vTensorSize[i], cudaMemcpyHostToDevice)); // copy the data from host memory to GPU memory
    }
    for (int i = 0; i < nIO; ++i)
    {
        context->setTensorAddress(vTensorName[i].c_str(), vBufferD[i]); // bind the I/O tensor addresses, i.e. tell the GPU where to read the data and where to store the results
    }
    context->enqueueV3(0); // run inference
    for (int i = nInput; i < nIO; ++i)
    {
        CHECK(cudaMemcpy(vBufferH[i], vBufferD[i], vTensorSize[i], cudaMemcpyDeviceToHost)); // copy the results back to host memory
    }
    for (int i = 0; i < nIO; ++i)
    {
        printArrayInformation((float *)vBufferH[i], context->getTensorShape(vTensorName[i].c_str()), vTensorName[i], true, true); // print the results
    }
    // Release resources
    for (int i = 0; i < nIO; ++i)
    {
        delete[] (char *)vBufferH[i];
        CHECK(cudaFree(vBufferD[i]));
    }
    return;
}
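Note that enqueueV3(0) above launches on the default CUDA stream, and the synchronous cudaMemcpy calls around it serialize everything anyway. A sketch of the same copy-compute-copy sequence on an explicit stream with asynchronous copies, reusing the buffers from the code above (for genuinely overlapped transfers the host buffers would additionally have to be pinned via cudaMallocHost):
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
CHECK(cudaMemcpyAsync(vBufferD[0], vBufferH[0], vTensorSize[0], cudaMemcpyHostToDevice, stream));
context->enqueueV3(stream);           // only queued here, not yet finished
CHECK(cudaMemcpyAsync(vBufferH[1], vBufferD[1], vTensorSize[1], cudaMemcpyDeviceToHost, stream));
CHECK(cudaStreamSynchronize(stream)); // wait for the whole pipeline to drain
CHECK(cudaStreamDestroy(stream));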
cookbookHelper.cuh
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef COOKBOOKHELPER_CUH
#define COOKBOOKHELPER_CUH
#include <NvInfer.h>
#include <cassert>
#include <chrono>
#include <cmath>
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <string>
#include <thread>
#include <unistd.h>
#include <vector>
// result value check of cuda runtime
#define CHECK(call) check(call, __LINE__, __FILE__)
inline bool check(cudaError_t e, int iLine, const char *szFile)
{
if (e != cudaSuccess)
{
std::cout << "CUDA runtime API error " << cudaGetErrorName(e) << " at line " << iLine << " in file " << szFile << std::endl;
return false;
}
return true;
}
using namespace nvinfer1;
// plugin debug function
#ifdef DEBUG
#define WHERE_AM_I() \
do \
{ \
printf("%14p[%s]\n", this, __func__); \
} while (0);
#else
#define WHERE_AM_I()
#endif // ifdef DEBUG
#define CEIL_DIVIDE(X, Y) (((X) + (Y)-1) / (Y))
#define ALIGN_TO(X, Y) (CEIL_DIVIDE(X, Y) * (Y))
__global__ static void printGPUHalf(const half *in, const int n = 10)
{
printf("\n");
for (int i = 0; i < n; ++i)
{
printf("%4d:%.3f,", i, float(in[i]));
}
printf("\n");
return;
}
__global__ static void printGPUFloat(float *in, const int n = 10)
{
printf("\n");
for (int i = 0; i < n; ++i)
{
printf("%4d:%3.f,", i, in[i]);
}
printf("\n");
return;
}
__global__ static void printGPUInt(const int *in, const int n = 10)
{
printf("\n");
for (int i = 0; i < n; ++i)
{
printf("%d:%d,", i, in[i]);
}
printf("\n");
return;
}
// TensorRT journal
class Logger : public ILogger
{
public:
Severity reportableSeverity;
Logger(Severity severity = Severity::kINFO):
reportableSeverity(severity) {}
void log(Severity severity, const char *msg) noexcept override
{
if (severity > reportableSeverity)
{
return;
}
switch (severity)
{
case Severity::kINTERNAL_ERROR:
std::cerr << "INTERNAL_ERROR: ";
break;
case Severity::kERROR:
std::cerr << "ERROR: ";
break;
case Severity::kWARNING:
std::cerr << "WARNING: ";
break;
case Severity::kINFO:
std::cerr << "INFO: ";
break;
default:
std::cerr << "VERBOSE: ";
break;
}
std::cerr << msg << std::endl;
}
};
// print the shape of a TensorRT tensor
// void printShape(Dims32 &dim)
// {
// std::cout << "[";
// for (int i = 0; i < dim.nbDims; ++i)
// {
// std::cout << dim.d[i] << ", ";
// }
// std::cout << "]" << std::endl;
// return;
// }
// print data in the array
template<typename T>
void printArrayRecursion(const T *pArray, Dims32 dim, int iDim, int iStart)
{
if (iDim == dim.nbDims - 1)
{
for (int i = 0; i < dim.d[iDim]; ++i)
{
std::cout << std::fixed << std::setprecision(3) << std::setw(6) << double(pArray[iStart + i]) << " ";
}
}
else
{
int nElement = 1;
for (int i = iDim + 1; i < dim.nbDims; ++i)
{
nElement *= dim.d[i];
}
for (int i = 0; i < dim.d[iDim]; ++i)
{
printArrayRecursion<T>(pArray, dim, iDim + 1, iStart + i * nElement);
}
}
std::cout << std::endl;
return;
}
template<typename T>
void printArrayInformation(const T *pArray, Dims32 dim, std::string name = std::string(""), bool bPrintInformation = true, bool bPrintArray = false, int n = 10)
{
// print shape information
std::cout << std::endl;
std::cout << name << ": (";
for (int i = 0; i < dim.nbDims; ++i)
{
std::cout << dim.d[i] << ", ";
}
std::cout << ")" << std::endl;
// print statistic information of the array
if (bPrintInformation)
{
int nElement = 1; // number of elements with batch dimension
for (int i = 0; i < dim.nbDims; ++i)
{
nElement *= dim.d[i];
}
double sum = double(pArray[0]);
double absSum = double(fabs(double(pArray[0])));
double sum2 = double(pArray[0]) * double(pArray[0]);
double diff = 0.0;
double maxValue = double(pArray[0]);
double minValue = double(pArray[0]);
for (int i = 1; i < nElement; ++i)
{
sum += double(pArray[i]);
absSum += double(fabs(double(pArray[i])));
sum2 += double(pArray[i]) * double(pArray[i]);
maxValue = double(pArray[i]) > maxValue ? double(pArray[i]) : maxValue;
minValue = double(pArray[i]) < minValue ? double(pArray[i]) : minValue;
diff += abs(double(pArray[i]) - double(pArray[i - 1]));
}
double mean = sum / nElement;
double var = sum2 / nElement - mean * mean;
std::cout << "absSum=" << std::fixed << std::setprecision(4) << std::setw(7) << absSum << ",";
std::cout << "mean=" << std::fixed << std::setprecision(4) << std::setw(7) << mean << ",";
std::cout << "var=" << std::fixed << std::setprecision(4) << std::setw(7) << var << ",";
std::cout << "max=" << std::fixed << std::setprecision(4) << std::setw(7) << maxValue << ",";
std::cout << "min=" << std::fixed << std::setprecision(4) << std::setw(7) << minValue << ",";
std::cout << "diff=" << std::fixed << std::setprecision(4) << std::setw(7) << diff << ",";
std::cout << std::endl;
// print first n element and last n element
for (int i = 0; i < n; ++i)
{
std::cout << std::fixed << std::setprecision(5) << std::setw(8) << double(pArray[i]) << ", ";
}
std::cout << std::endl;
for (int i = nElement - n; i < nElement; ++i)
{
std::cout << std::fixed << std::setprecision(5) << std::setw(8) << double(pArray[i]) << ", ";
}
std::cout << std::endl;
}
// print the data of the array
if (bPrintArray)
{
printArrayRecursion<T>(pArray, dim, 0, 0);
}
return;
}
template void printArrayInformation(const float *, Dims32, std::string, bool, bool, int);
template void printArrayInformation(const half *, Dims32, std::string, bool, bool, int);
template void printArrayInformation(const char *, Dims32, std::string, bool, bool, int);
template void printArrayInformation(const int *, Dims32, std::string, bool, bool, int);
template void printArrayInformation(const bool *, Dims32, std::string, bool, bool, int);
// get the size in byte of a TensorRT data type
__inline__ size_t dataTypeToSize(DataType dataType)
{
switch (dataType)
{
case DataType::kFLOAT:
return 4;
case DataType::kHALF:
return 2;
case DataType::kINT8:
return 1;
case DataType::kINT32:
return 4;
case DataType::kBOOL:
return 1;
case DataType::kUINT8:
return 1;
case DataType::kFP8:
return 1;
default:
return 4;
}
}
// get the string of a TensorRT shape
__inline__ std::string shapeToString(Dims32 dim)
{
std::string output("(");
if (dim.nbDims == 0)
{
return output + std::string(")");
}
for (int i = 0; i < dim.nbDims - 1; ++i)
{
output += std::to_string(dim.d[i]) + std::string(", ");
}
output += std::to_string(dim.d[dim.nbDims - 1]) + std::string(")");
return output;
}
// get the string of a TensorRT data type
__inline__ std::string dataTypeToString(DataType dataType)
{
switch (dataType)
{
case DataType::kFLOAT:
return std::string("FP32 ");
case DataType::kHALF:
return std::string("FP16 ");
case DataType::kINT8:
return std::string("INT8 ");
case DataType::kINT32:
return std::string("INT32");
case DataType::kBOOL:
return std::string("BOOL ");
default:
return std::string("Unknown");
}
}
// get the string of a TensorRT data format
__inline__ std::string formatToString(TensorFormat format)
{
switch (format)
{
case TensorFormat::kLINEAR:
return std::string("LINE ");
case TensorFormat::kCHW2:
return std::string("CHW2 ");
case TensorFormat::kHWC8:
return std::string("HWC8 ");
case TensorFormat::kCHW4:
return std::string("CHW4 ");
case TensorFormat::kCHW16:
return std::string("CHW16");
case TensorFormat::kCHW32:
return std::string("CHW32");
case TensorFormat::kHWC:
return std::string("HWC ");
case TensorFormat::kDLA_LINEAR:
return std::string("DLINE");
case TensorFormat::kDLA_HWC4:
return std::string("DHWC4");
case TensorFormat::kHWC16:
return std::string("HWC16");
default: return std::string("None ");
}
}
// get the string of a TensorRT layer kind
__inline__ std::string layerTypeToString(LayerType layerType)
{
switch (layerType)
{
case LayerType::kCONVOLUTION: return std::string("CONVOLUTION");
case LayerType::kFULLY_CONNECTED: return std::string("FULLY_CONNECTED");
case LayerType::kACTIVATION: return std::string("ACTIVATION");
case LayerType::kPOOLING: return std::string("POOLING");
case LayerType::kLRN: return std::string("LRN");
case LayerType::kSCALE: return std::string("SCALE");
case LayerType::kSOFTMAX: return std::string("SOFTMAX");
case LayerType::kDECONVOLUTION: return std::string("DECONVOLUTION");
case LayerType::kCONCATENATION: return std::string("CONCATENATION");
case LayerType::kELEMENTWISE: return std::string("ELEMENTWISE");
case LayerType::kPLUGIN: return std::string("PLUGIN");
case LayerType::kUNARY: return std::string("UNARY");
case LayerType::kPADDING: return std::string("PADDING");
case LayerType::kSHUFFLE: return std::string("SHUFFLE");
case LayerType::kREDUCE: return std::string("REDUCE");
case LayerType::kTOPK: return std::string("TOPK");
case LayerType::kGATHER: return std::string("GATHER");
case LayerType::kMATRIX_MULTIPLY: return std::string("MATRIX_MULTIPLY");
case LayerType::kRAGGED_SOFTMAX: return std::string("RAGGED_SOFTMAX");
case LayerType::kCONSTANT: return std::string("CONSTANT");
case LayerType::kRNN_V2: return std::string("RNN_V2");
case LayerType::kIDENTITY: return std::string("IDENTITY");
case LayerType::kPLUGIN_V2: return std::string("PLUGIN_V2");
case LayerType::kSLICE: return std::string("SLICE");
case LayerType::kSHAPE: return std::string("SHAPE");
case LayerType::kPARAMETRIC_RELU: return std::string("PARAMETRIC_RELU");
case LayerType::kRESIZE: return std::string("RESIZE");
case LayerType::kTRIP_LIMIT: return std::string("TRIP_LIMIT");
case LayerType::kRECURRENCE: return std::string("RECURRENCE");
case LayerType::kITERATOR: return std::string("ITERATOR");
case LayerType::kLOOP_OUTPUT: return std::string("LOOP_OUTPUT");
case LayerType::kSELECT: return std::string("SELECT");
case LayerType::kFILL: return std::string("FILL");
case LayerType::kQUANTIZE: return std::string("QUANTIZE"); // Quantize and the following layers appear since TensorRT 8.0
case LayerType::kDEQUANTIZE: return std::string("DEQUANTIZE");
case LayerType::kCONDITION: return std::string("CONDITION");
case LayerType::kCONDITIONAL_INPUT: return std::string("CONDITIONAL_INPUT");
case LayerType::kCONDITIONAL_OUTPUT: return std::string("CONDITIONAL_OUTPUT");
case LayerType::kSCATTER: return std::string("SCATTER");
case LayerType::kEINSUM: return std::string("EINSUM");
case LayerType::kASSERTION: return std::string("ASSERTION");
case LayerType::kONE_HOT: return std::string("ONE_HOT"); // OneHot and the following layers appear since TensorRT 8.5
case LayerType::kNON_ZERO: return std::string("NON_ZERO");
case LayerType::kGRID_SAMPLE: return std::string("GRID_SAMPLE");
case LayerType::kNMS: return std::string("NMS");
default: return std::string("Unknown");
}
}
#endif
CreateSaveEngine.h
#ifndef CREATESAVEENGINE_H
#define CREATESAVEENGINE_H
#include "cookbookHelper.cuh"
void create_save_engine(std::string engine_file_name);
#endif
LoadRunEngine.h
#ifndef LOADRUNENGINE_H
#define LOADRUNENGINE_H
#include "cookbookHelper.cuh"
void load_run_engine(std::string engine_file_name);
#endif
main.cpp
#include "./include/CreateSaveEngine.h"
#include "./include/LoadRunEngine.h"
#include "./include/cookbookHelper.cuh"
int main()
{
    std::string engine_file_name {"./model.plan"};
    create_save_engine(engine_file_name);
    load_run_engine(engine_file_name);
    return 0;
}
Makefile
CUDA_PATH = /usr/local/cuda
NVCC = $(CUDA_PATH)/bin/nvcc
TRT_INC_PATH = /usr/include/x86_64-linux-gnu
TRT_LIB_PATH = /usr/lib/x86_64-linux-gnu
GENCODE = -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89
DEBUG_MACRO = -UDEBUG
WARNING_MACRO = -w
CUFLAG = -std=c++14 -O3 $(DEBUG_MACRO) -Xcompiler -fPIC $(GENCODE)
CCFLAG = -std=c++14 -O3 $(DEBUG_MACRO) -Xcompiler -fPIC -use_fast_math
SOFLAG = -shared
INCLUDE = -I. -I$(CUDA_PATH)/include -I$(TRT_INC_PATH)
INCLUDE += -I../../include -I../../../include
INCLUDE += -I./include
LDFLAG = -L$(CUDA_PATH)/lib64 -lcudart -L$(TRT_LIB_PATH) -lnvinfer
SOURCE_CPP = $(shell find . -name '*.cpp' 2>/dev/null)
SOURCE_PY = $(shell find . -name '*.py' 2>/dev/null)
OBJ = $(SOURCE_CPP:.cpp=.o)
DEP = $(OBJ:.o=.d)
TARGET_EXE = main.exe
#TARGET_EXE = $(SOURCE_CPP:.cpp=.exe)
-include $(DEP)
all: $(TARGET_EXE)
$(TARGET_EXE): $(OBJ)
	@echo OBJ_list is $(OBJ)
$(NVCC) $(CCFLAG) $(LDFLAG) -o $(TARGET_EXE) $(OBJ)
%.o: %.cpp
	@echo SOURCE_CPP_list is $(SOURCE_CPP)
# $(NVCC) $(CCFLAG) $(INCLUDE) -M -MT $@ -o $(@:.o=.d) $<
$(NVCC) $(CCFLAG) $(INCLUDE) -Xcompiler -fPIC -o $@ -c $<
.PHONY: test
test:
make clean
make
python3 $(SOURCE_PY)
rm -rf ./*.plan
./$(TARGET_EXE)
.PHONY: clean
clean:
rm -rf ./*.d ./*.o ./*.so ./*.exe ./*.plan