#include #include "use_mnn.hpp" #include using namespace std; using namespace GUD::ALG; ForwardNet::ForwardNet(const gud_siamrpn_config_t conf) { #if (1 == D_CPU_BIGCORE) cpu_set_t mask; //CPU核的集合 cpu_set_t get; //获取在集合中的CPU //int num= sysconf(_SC_NPROCESSORS_CONF); int j = 0; CPU_ZERO(&mask); /* 初始化set集,将set置为空*/ /*将本进程绑定到CPU0上*/ CPU_SET(3, &mask); // 3559 core 3 if (sched_setaffinity(0, sizeof(mask), &mask) == -1) { printf("Set CPU affinity failue, ERROR:%s\n", strerror(errno)); exit(-1); } else { printf("Set CPU affinity Sucess, Just run ForwardNet Class\n"); } #endif numThread_ = conf.thread_num; prec_type_ = conf.precision_type; deviceType_ = conf.device_type; t1_ = steady_clock::now(); net_init(0, conf.mnnModel_0); net_init(1, conf.mnnModel_1); t2_ = steady_clock::now(); time_span_ = duration_cast < duration < double >> (t2_ - t1_); printf("#MNN Model Load time is %f ms.\n",1000 * time_span_.count()); clsOut_dims_[2] = 1; clsOut_dims_[3] = 1; regOut_dims_[2] = 1; regOut_dims_[3] = 1; updataCKDone_ = 0; updataRKDone_ = 0; /** create inputs & output tensor for two networks **/ clsMapTensor_ = MNN::Tensor::create(clsMap_dims_, NULL, MNN::Tensor::CAFFE); clsKernelTensor_ = MNN::Tensor::create(ck_dims_, NULL, MNN::Tensor::CAFFE); regMapTensor_ = MNN::Tensor::create(regMap_dims_, NULL, MNN::Tensor::CAFFE); regKernelTensor_ = MNN::Tensor::create(rk_dims_, NULL, MNN::Tensor::CAFFE); clsOutTensor_ = MNN::Tensor::create(clsOut_dims_, NULL, MNN::Tensor::CAFFE); regOutTensor_ = MNN::Tensor::create(regOut_dims_, NULL, MNN::Tensor::CAFFE); #if (DEBUG_TMP == 1) printf ("Create Tensor clsOut_dims_[%d %d %d %d]\n",clsOut_dims_[0],clsOut_dims_[1], clsOut_dims_[2], clsOut_dims_[3]); printf ("Create Tensor RegOut_dims_[%d %d %d %d]\n",regOut_dims_[0],regOut_dims_[1], regOut_dims_[2], regOut_dims_[3]); #endif } ForwardNet::~ForwardNet() { net_deinit(0); net_deinit(1); } int ForwardNet::net_init(int model_idx,const char* model_path) { //numThread_ = 4; printf ("Start Loading Net:[%d] (%s)#\n",model_idx,model_path); if (model_idx == 0) { /** create network session **/ auto net = std::shared_ptr(MNN::Interpreter::createFromFile(model_path)); MNN::ScheduleConfig config; if (deviceType_ == 0){ config.type = MNN_FORWARD_CPU; } else if (deviceType_ == 1){ config.type = MNN_FORWARD_OPENCL; // MNN_FORWARD_CPU==0; MNN_FORWARD_OPENCL==3 } config.numThread = numThread_; net_cls_ = net; MNN::BackendConfig bnconfig; //bnconfig.memory = MNN::BackendConfig::Memory_Low; //bnconfig.power = MNN::BackendConfig::Power_Low; if (prec_type_ == 0) { bnconfig.precision = MNN::BackendConfig::Precision_Normal; } else if (prec_type_ == 1) { bnconfig.precision = MNN::BackendConfig::Precision_High; } else if (prec_type_ == 2) { bnconfig.precision = MNN::BackendConfig::Precision_Low; } else if (prec_type_ == 3) { bnconfig.precision = MNN::BackendConfig::Precision_Low_BF16; } config.backendConfig = &bnconfig; MNNconfig_cls_ = config; auto session = net->createSession(config); /** get inputs & outputs shape **/ auto inputs = net->getSessionInputAll(session); // multi inputs ck_dims_ = inputs[clsKernelName]->shape(); clsMap_dims_ = inputs[clsMapName]->shape(); auto Routput0 = net->getSessionOutput(session, clsOutputName.data()); // signal output clsOut_dims_ = Routput0->shape(); session_cls_ = session; /** set inputs cls tensor **/ inputClsTensor = net_cls_->getSessionInputAll(session_cls_); #if (DEBUG_TMP == 1) printf ("ck_dims_[%d %d %d %d]\n",ck_dims_[0],ck_dims_[1], ck_dims_[2], ck_dims_[3]); printf ("clsMap_dims_[%d %d %d %d]\n",clsMap_dims_[0],clsMap_dims_[1], clsMap_dims_[2], clsMap_dims_[3]); printf ("clsOut_dims_[%d %d %d %d]\n",clsOut_dims_[0],clsOut_dims_[1], clsOut_dims_[2], clsOut_dims_[3]); float memoryUsage = 0.0f; net->getSessionInfo(session, MNN::Interpreter::MEMORY, &memoryUsage); float flops = 0.0f; net->getSessionInfo(session, MNN::Interpreter::FLOPS, &flops); int backendType[2]; net->getSessionInfo(session, MNN::Interpreter::BACKENDS, backendType); MNN_PRINT("RefSession Info: memory use [%f] MB, flops is [%f] M, backendType is [%d], PrecisionMode:[%d] PowerMode:[%d]\n", memoryUsage, flops, backendType[0], bnconfig.precision,bnconfig.power); #endif printf ("#MNN NetInit Load clsModel:{%s} Sucess; Thread_num:[%d] Device:[%d] #\n",model_path, numThread_,config.type); } else if (model_idx == 1) { /** create network session **/ auto net = std::shared_ptr(MNN::Interpreter::createFromFile(model_path)); MNN::ScheduleConfig config; if (deviceType_ == 0){ config.type = MNN_FORWARD_CPU; } else if (deviceType_ == 1){ config.type = MNN_FORWARD_OPENCL; // MNN_FORWARD_CPU==0; MNN_FORWARD_OPENCL==3 } config.numThread = numThread_; net_reg_ = net; MNN::BackendConfig bnconfig; //bnconfig.memory = MNN::BackendConfig::Memory_Low; //bnconfig.power = MNN::BackendConfig::Power_Low; if (prec_type_ == 0) { bnconfig.precision = MNN::BackendConfig::Precision_Normal; } else if (prec_type_ == 1) { bnconfig.precision = MNN::BackendConfig::Precision_High; } else if (prec_type_ == 2) { bnconfig.precision = MNN::BackendConfig::Precision_Low; } else if (prec_type_ == 3) { bnconfig.precision = MNN::BackendConfig::Precision_Low_BF16; } config.backendConfig = &bnconfig; MNNconfig_reg_ = config; auto session = net->createSession(config); /** get inputs & outputs shape **/ auto inputs = net->getSessionInputAll(session); // multi inputs rk_dims_ = inputs[regKernelNane]->shape(); regMap_dims_ = inputs[regMapName]->shape(); auto Routput1 = net->getSessionOutput(session, regOutputName.data()); // signal output regOut_dims_ = Routput1->shape(); session_reg_ = session; /** set inputs reg tensor **/ inputRegTensor = net_reg_->getSessionInputAll(session_reg_); #if (DEBUG_TMP == 1) printf ("rk_dims_[%d %d %d %d]\n",rk_dims_[0],rk_dims_[1], rk_dims_[2], rk_dims_[3]); printf ("regMap_dims_[%d %d %d %d]\n",regMap_dims_[0],regMap_dims_[1], regMap_dims_[2], regMap_dims_[3]); printf ("regOut_dims_[%d %d %d %d]\n",regOut_dims_[0],clsOut_dims_[1], regOut_dims_[2], regOut_dims_[3]); float memoryUsage = 0.0f; net->getSessionInfo(session, MNN::Interpreter::MEMORY, &memoryUsage); float flops = 0.0f; net->getSessionInfo(session, MNN::Interpreter::FLOPS, &flops); int backendType[2]; net->getSessionInfo(session, MNN::Interpreter::BACKENDS, backendType); MNN_PRINT("SearSession Info: memory use [%f] MB, flops is [%f] M, backendType is [%d], PrecisionMode:[%d] PowerMode:[%d]\n", memoryUsage, flops, backendType[0], bnconfig.precision, bnconfig.power); #endif printf ("#MNN NetInit Load regModel:{%s} Sucess; Thread_num:[%d] Device:[%d]#\n",model_path, numThread_, config.type); } #if (USE_DOUBLE_HEADER == 1) else if (model_idx == 2) { /** net input & output names **/ std::string clsKernelInput = "clskernel"; std::string clsMapInput = "x"; std::string clsOutput = "57"; std::string regKernelInput = "regkernel"; std::string regMapInput = "y"; std::string regOutput = "62"; /** create network session **/ auto net = std::shared_ptr(MNN::Interpreter::createFromFile(model_path)); MNN::ScheduleConfig config; config.type = MNN_FORWARD_OPENCL; // MNN_FORWARD_CPU MNN_FORWARD_OPENCL config.numThread = numThread_; if (config.type == MNN_FORWARD_OPENCL) config.mode = MNN_GPU_TUNING_FAST; net_double_ = net; MNN::BackendConfig bnconfig; //bnconfig.memory = MNN::BackendConfig::Memory_Low; //bnconfig.power = MNN::BackendConfig::Power_Low; bnconfig.precision = MNN::BackendConfig::Precision_Low; // Precision_Normal = 0, Precision_High, Precision_Low Precision_Low_BF16 config.backendConfig = &bnconfig; MNNconfig_double_ = config; //net->setSessionMode(MNN::Interpreter::Session_Input_User); auto session = net->createSession(config); /** get inputs & outputs shape **/ auto inputs = net->getSessionInputAll(session); // multi inputs ck_dims_ = inputs[clsKernelInput]->shape(); clsMap_dims_ = inputs[clsMapInput]->shape(); rk_dims_ = inputs[regKernelInput]->shape(); regMap_dims_ = inputs[regMapInput]->shape(); auto Coutput1 = net->getSessionOutput(session, regOutput.data()); // signal output clsOut_dims_ = Coutput1->shape(); auto Routput1 = net->getSessionOutput(session, regOutput.data()); // signal output regOut_dims_ = Routput1->shape(); session_double_ = session; /** set inputs reg tensor **/ input4Tensors = net_double_->getSessionInputAll(session_double_); input4Tensors[clsKernelInput] = clsKernelTensor_; input4Tensors[clsMapInput] = clsMapTensor_; input4Tensors[regKernelInput] = regKernelTensor_; input4Tensors[regMapInput] = regMapTensor_; // net_double_->resizeSession(session_double_); #if (DEBUG_TMP == 1) printf ("ck_dims_[%d %d %d %d]\n",ck_dims_[0],ck_dims_[1], ck_dims_[2], ck_dims_[3]); printf ("clsMap_dims_[%d %d %d %d]\n",clsMap_dims_[0],clsMap_dims_[1], clsMap_dims_[2], clsMap_dims_[3]); printf ("clsOut_dims_[%d %d %d %d]\n",clsOut_dims_[0],clsOut_dims_[1], clsOut_dims_[2], clsOut_dims_[3]); printf ("rk_dims_[%d %d %d %d]\n",rk_dims_[0],rk_dims_[1], rk_dims_[2], rk_dims_[3]); printf ("regMap_dims_[%d %d %d %d]\n",regMap_dims_[0],regMap_dims_[1], regMap_dims_[2], regMap_dims_[3]); printf ("regOut_dims_[%d %d %d %d]\n",regOut_dims_[0],clsOut_dims_[1], regOut_dims_[2], regOut_dims_[3]); float memoryUsage = 0.0f; net->getSessionInfo(session, MNN::Interpreter::MEMORY, &memoryUsage); float flops = 0.0f; net->getSessionInfo(session, MNN::Interpreter::FLOPS, &flops); int backendType[2]; net->getSessionInfo(session, MNN::Interpreter::BACKENDS, backendType); MNN_PRINT("SearSession Info: memory use [%f] MB, flops is [%f] M, backendType is [%d], PrecisionMode:[%d] PowerMode:[%d]\n", memoryUsage, flops, backendType[0], bnconfig.precision, bnconfig.power); #endif printf ("#MNN NetInit Load DoubleHeaderModel:{%s} Sucess!; Thread_num:[%d] Device:[%d] #\n", model_path, numThread_, config.type); } #endif initialized_ = true; printf("*** Loading Model sucess!!! ***\n"); return 0; } int ForwardNet::net_deinit(int model_idx) { if (model_idx == 0) net_cls_->releaseSession(session_cls_); if (model_idx == 1) net_reg_->releaseSession(session_reg_); return 0; } int ForwardNet::clsMnnForward(const float *cmap_data, float* coutput_data, bool dataReady) { t1_ = steady_clock::now(); if (!dataReady){ auto clsM_data = clsMapTensor_->host(); auto clsM_size = static_cast(clsMapTensor_->elementSize()); ::memcpy(clsM_data, cmap_data, sizeof(float)*clsM_size); } auto clsKernelInput = inputClsTensor[clsKernelName]; auto clsMapInput = inputClsTensor[clsMapName]; clsKernelInput->copyFromHostTensor(clsKernelTensor_); clsMapInput->copyFromHostTensor(clsMapTensor_); /**run network **/ net_cls_->runSession(session_cls_); /**get output data **/ auto outTensor = net_cls_->getSessionOutput(session_cls_, clsOutputName.data()); auto outshape = outTensor->shape(); int outsize = outTensor->elementSize(); int outClsSize = outshape[0]*outshape[1]; outTensor->copyToHostTensor(clsOutTensor_); //printf ("######### showTensor(clsOutTensor_) ##############\n"); //showTensor(clsOutTensor_,outClsSize,5*19*19); ::memcpy(coutput_data, clsOutTensor_->host(), sizeof(float)*outClsSize); t2_ = steady_clock::now(); #if (DEBUG_TMP == 1) time_span_ = duration_cast < duration < double >> (t2_ - t1_); printf("#Run cls getSessionOutput time is %f ms.\n",1000 * time_span_.count()); #endif return 0; } int ForwardNet::regMnnForward(const float *rmap_data, float* routput_data, bool dataReady) { t1_ = steady_clock::now(); if (!dataReady) { auto regM_data = regMapTensor_->host(); auto regM_size = static_cast(regMapTensor_->elementSize()); ::memcpy(regM_data, rmap_data, sizeof(float)*regM_size); } auto regKernelInput = inputRegTensor[regKernelNane]; auto regMapInput = inputRegTensor[regMapName]; regKernelInput->copyFromHostTensor(regKernelTensor_); regMapInput->copyFromHostTensor(regMapTensor_); /**run network **/ net_reg_->runSession(session_reg_); /**get output data **/ auto outTensor = net_reg_->getSessionOutput(session_reg_, regOutputName.data()); auto outshape = outTensor->shape(); int outsize = outTensor->elementSize(); int outRegSize = outshape[0]*outshape[1]; outTensor->copyToHostTensor(regOutTensor_); //printf ("######### showTensor(regOutTensor_) ##############\n"); //showTensor(regOutTensor_,outRegSize,5*19*19); ::memcpy(routput_data, regOutTensor_->host(), sizeof(float)*outRegSize); t2_ = steady_clock::now(); #if (DEBUG_TMP == 1) time_span_ = duration_cast < duration < double >> (t2_ - t1_); printf("#Run reg getSessionOutput time is %f ms.\n",1000 * time_span_.count()); #endif return 0; } int ForwardNet::updateTwoKernels(const float *ckernel_data, const float *rkernel_data) { auto clsK_data = clsKernelTensor_->host(); auto clsK_size = static_cast(clsKernelTensor_->elementSize()); auto regK_data = regKernelTensor_->host(); auto regK_size = static_cast(regKernelTensor_->elementSize()); auto cshape = clsKernelTensor_->shape(); auto rshape = regKernelTensor_->shape(); clsK_size = cshape[0] * cshape[1] * cshape[2] * cshape[3]; regK_size = rshape[0] * rshape[1] * rshape[2] * rshape[3]; t1_ = steady_clock::now(); ::memcpy(clsK_data, ckernel_data, sizeof(float)*clsK_size); ::memcpy(regK_data, rkernel_data, sizeof(float)*regK_size); t2_ = steady_clock::now(); time_span_ = duration_cast < duration < double >> (t2_ - t1_); updataRKDone_ = 1; updataCKDone_ = 1; // printf("#Run updateTwoKernels time is %f ms.\n",1000 * time_span_.count()); // printf ("######### showTensor(clsKernelTensor_) ##############\n"); // showTensor(clsKernelTensor_,10*256*4*4,256*4*4); // printf ("######### showTensor(regKernelTensor_) ##############\n"); // showTensor(regKernelTensor_,20*256*4*4,256*4*4); return 0; } int ForwardNet::updateTwoMaps(const float *cm_data, const float *rm_data) { auto cMapData = clsMapTensor_->host(); auto cm_size = static_cast(clsMapTensor_->elementSize()); auto rMapData = regMapTensor_->host(); auto rm_size = static_cast(regMapTensor_->elementSize()); auto cmshape = clsMapTensor_->shape(); auto rmshape = regMapTensor_->shape(); int clsM_size = cmshape[0] * cmshape[1] * cmshape[2] * cmshape[3]; int regM_size = rmshape[0] * rmshape[1] * rmshape[2] * rmshape[3]; t1_ = steady_clock::now(); ::memcpy(cMapData, cm_data, sizeof(float)*clsM_size); ::memcpy(rMapData, rm_data, sizeof(float)*regM_size); t2_ = steady_clock::now(); time_span_ = duration_cast < duration < double >> (t2_ - t1_); //printf("#Run updateTwoMaps time is %f ms.\n",1000 * time_span_.count()); updataRMDone_ = 1; updataCMDone_ = 1; return 0; } void ForwardNet::showTensor(MNN::Tensor *TensorIn, int len, int diff) { auto Tshape = TensorIn->shape(); auto Tdata = TensorIn->host(); auto Tsize = static_cast(TensorIn->size()); printf ("\n***** Tensor shape:[%d %d %d %d], size:%d ******\n",Tshape[0],Tshape[1],Tshape[2],Tshape[3],Tsize); printf ("DimensionType: %d dimensions:%d elementSize:%d\n",TensorIn->getDimensionType(), TensorIn->dimensions(), TensorIn->elementSize()); int lenN = 1; for (int i = 0; i < len; ++i) { //printf ("%.4f ",Tdata[i]); printf ("%.3f ",Tdata[i]); if (i%(diff) == 0 and i!=0){ printf("\n#[%d]#\n",lenN); lenN++; } } printf ("\n***** END ******\n"); }