#include <opencv2/imgproc.hpp>
#include "use_mnn.hpp"
#include <thread>

using namespace std;
using namespace GUD::ALG;

    /**
     * @brief Load both MNN models and allocate the host-side staging tensors.
     *
     * Copies the thread count, precision type and device type out of @p conf,
     * loads model 0 (classification) and model 1 (regression) through
     * net_init(), and then creates one host tensor for every network input and
     * output using the shapes net_init() recorded.
     *
     * When D_CPU_BIGCORE is 1 the calling thread is first pinned to CPU core 3;
     * the process exits if the affinity cannot be set.
     *
     * If either net_init() call reports a failure the constructor stops early:
     * every staging tensor pointer is left null and initialized_ is false.
     *
     * @param conf Configuration providing thread_num, precision_type,
     *             device_type and the two model paths (mnnModel_0, mnnModel_1).
     */
    ForwardNet::ForwardNet(const gud_siamrpn_config_t conf)
    {
#if (1 == D_CPU_BIGCORE)
        cpu_set_t mask;      // set of CPU cores this thread may run on
        CPU_ZERO(&mask);     // start from an empty set
        CPU_SET(3, &mask);   // allow core 3 only (original note: "3559 core 3")

        // pid 0 selects the calling thread.
        if (sched_setaffinity(0, sizeof(mask), &mask) == -1)
        {
            printf("Set CPU affinity failue, ERROR:%s\n", strerror(errno));
            exit(-1);
        }
        else
        {
            printf("Set CPU affinity Sucess, Just run ForwardNet Class\n");
        }
#endif
        numThread_  = conf.thread_num;
        prec_type_  = conf.precision_type;
        deviceType_ = conf.device_type;

        updataCKDone_ = 0;
        updataRKDone_ = 0;

        // Give every staging tensor a defined value so that an early return
        // below never leaves an indeterminate pointer behind.
        clsMapTensor_    = nullptr;
        clsKernelTensor_ = nullptr;
        regMapTensor_    = nullptr;
        regKernelTensor_ = nullptr;
        clsOutTensor_    = nullptr;
        regOutTensor_    = nullptr;

        t1_ = steady_clock::now();
        const int clsStatus = net_init(0, conf.mnnModel_0);
        const int regStatus = net_init(1, conf.mnnModel_1);
        t2_ = steady_clock::now();
        time_span_ = duration_cast < duration < double >> (t2_ - t1_);
        printf("#MNN Model Load time is %f ms.\n",1000 * time_span_.count());

        if (clsStatus != 0 || regStatus != 0)
        {
            // Without both sessions the recorded shapes are not trustworthy,
            // so do not allocate tensors from them.
            printf("#MNN Model Load failed: cls status [%d], reg status [%d]\n", clsStatus, regStatus);
            initialized_ = false;
            return;
        }

        // The host output tensors are allocated with the last two of the four
        // recorded output extents forced to 1.
        // NOTE(review): presumably each head yields one value per channel - confirm.
        if (clsOut_dims_.size() >= 4)
        {
            clsOut_dims_[2] = 1;
            clsOut_dims_[3] = 1;
        }
        if (regOut_dims_.size() >= 4)
        {
            regOut_dims_[2] = 1;
            regOut_dims_[3] = 1;
        }

        /** create inputs & output tensor for two networks **/
        clsMapTensor_    = MNN::Tensor::create<float>(clsMap_dims_, nullptr, MNN::Tensor::CAFFE);
        clsKernelTensor_ = MNN::Tensor::create<float>(ck_dims_, nullptr, MNN::Tensor::CAFFE);
        regMapTensor_    = MNN::Tensor::create<float>(regMap_dims_, nullptr, MNN::Tensor::CAFFE);
        regKernelTensor_ = MNN::Tensor::create<float>(rk_dims_, nullptr, MNN::Tensor::CAFFE);
        clsOutTensor_    = MNN::Tensor::create<float>(clsOut_dims_, nullptr, MNN::Tensor::CAFFE);
        regOutTensor_    = MNN::Tensor::create<float>(regOut_dims_, nullptr, MNN::Tensor::CAFFE);
#if (DEBUG_TMP == 1)
        if (clsOut_dims_.size() >= 4 && regOut_dims_.size() >= 4)
        {
            printf ("Create Tensor clsOut_dims_[%d %d %d %d]\n",clsOut_dims_[0],clsOut_dims_[1], clsOut_dims_[2], clsOut_dims_[3]);
            printf ("Create Tensor RegOut_dims_[%d %d %d %d]\n",regOut_dims_[0],regOut_dims_[1], regOut_dims_[2], regOut_dims_[3]);
        }
#endif
    }

    /**
     * @brief Release both sessions and free the host staging tensors.
     *
     * The six staging tensors are obtained from MNN::Tensor::create() in the
     * constructor and are held as raw pointers, so they must be freed here.
     * Plain `delete` is used; deleting a null pointer is a no-op.
     *
     * NOTE(review): the class owns raw pointers, so copying a ForwardNet by
     * value would free the tensors twice - confirm instances are never copied.
     */
    ForwardNet::~ForwardNet() {
        net_deinit(0);
        net_deinit(1);

        delete clsMapTensor_;
        delete clsKernelTensor_;
        delete regMapTensor_;
        delete regKernelTensor_;
        delete clsOutTensor_;
        delete regOutTensor_;

        clsMapTensor_    = nullptr;
        clsKernelTensor_ = nullptr;
        regMapTensor_    = nullptr;
        regKernelTensor_ = nullptr;
        clsOutTensor_    = nullptr;
        regOutTensor_    = nullptr;
    }

    int ForwardNet::net_init(int model_idx,const char* model_path)
    {
        //numThread_ = 4;
        printf ("Start Loading Net:[%d] (%s)#\n",model_idx,model_path);
        if (model_idx == 0)
        {
            /**  create network session  **/
            auto net = std::shared_ptr<MNN::Interpreter>(MNN::Interpreter::createFromFile(model_path));
            MNN::ScheduleConfig config;
            if (deviceType_ == 0){
                config.type      = MNN_FORWARD_CPU;
            }
            else if (deviceType_ == 1){
                config.type      = MNN_FORWARD_OPENCL;  // MNN_FORWARD_CPU==0; MNN_FORWARD_OPENCL==3
            }

            config.numThread = numThread_;
            net_cls_ = net;
            MNN::BackendConfig bnconfig;
            //bnconfig.memory = MNN::BackendConfig::Memory_Low;
            //bnconfig.power = MNN::BackendConfig::Power_Low;
            if (prec_type_ == 0) {
                bnconfig.precision =  MNN::BackendConfig::Precision_Normal;
            }
            else if (prec_type_ == 1) {
                bnconfig.precision =  MNN::BackendConfig::Precision_High;
            } else if (prec_type_ == 2) {
                bnconfig.precision =  MNN::BackendConfig::Precision_Low;
            }
            else if (prec_type_ == 3) {
                bnconfig.precision =  MNN::BackendConfig::Precision_Low_BF16;
            }
            config.backendConfig = &bnconfig;
            MNNconfig_cls_ = config;
            auto session = net->createSession(config);

            /** get inputs & outputs shape **/
            auto inputs = net->getSessionInputAll(session);    //  multi inputs
            ck_dims_ = inputs[clsKernelName]->shape();
            clsMap_dims_ = inputs[clsMapName]->shape();
            auto Routput0 = net->getSessionOutput(session, clsOutputName.data()); //  signal output
            clsOut_dims_ =  Routput0->shape();
            session_cls_  = session;

            /** set inputs cls tensor **/
            inputClsTensor = net_cls_->getSessionInputAll(session_cls_);

#if (DEBUG_TMP == 1)
            printf ("ck_dims_[%d %d %d %d]\n",ck_dims_[0],ck_dims_[1], ck_dims_[2], ck_dims_[3]);
            printf ("clsMap_dims_[%d %d %d %d]\n",clsMap_dims_[0],clsMap_dims_[1], clsMap_dims_[2], clsMap_dims_[3]);
            printf ("clsOut_dims_[%d %d %d %d]\n",clsOut_dims_[0],clsOut_dims_[1], clsOut_dims_[2], clsOut_dims_[3]);

            float memoryUsage = 0.0f;
            net->getSessionInfo(session, MNN::Interpreter::MEMORY, &memoryUsage);
            float flops = 0.0f;
            net->getSessionInfo(session, MNN::Interpreter::FLOPS, &flops);
            int backendType[2];
            net->getSessionInfo(session, MNN::Interpreter::BACKENDS, backendType);
            MNN_PRINT("RefSession Info: memory use [%f] MB, flops is [%f] M, backendType is [%d], PrecisionMode:[%d] PowerMode:[%d]\n",
                      memoryUsage, flops, backendType[0], bnconfig.precision,bnconfig.power);
#endif
            printf ("#MNN NetInit Load clsModel:{%s} Sucess; Thread_num:[%d] Device:[%d] #\n",model_path, numThread_,config.type);
        }

        else if (model_idx == 1)
        {
            /**  create network session  **/
            auto net = std::shared_ptr<MNN::Interpreter>(MNN::Interpreter::createFromFile(model_path));
            MNN::ScheduleConfig config;
            if (deviceType_ == 0){
                config.type      = MNN_FORWARD_CPU;
            }
            else if (deviceType_ == 1){
                config.type      = MNN_FORWARD_OPENCL;  // MNN_FORWARD_CPU==0; MNN_FORWARD_OPENCL==3
            }
            config.numThread = numThread_;
            net_reg_ = net;
            MNN::BackendConfig bnconfig;
            //bnconfig.memory = MNN::BackendConfig::Memory_Low;
            //bnconfig.power = MNN::BackendConfig::Power_Low;
            if (prec_type_ == 0) {
                bnconfig.precision =  MNN::BackendConfig::Precision_Normal;
            }
            else if (prec_type_ == 1) {
                bnconfig.precision =  MNN::BackendConfig::Precision_High;
            } else if (prec_type_ == 2) {
                bnconfig.precision =  MNN::BackendConfig::Precision_Low;
            }
            else if (prec_type_ == 3) {
                bnconfig.precision =  MNN::BackendConfig::Precision_Low_BF16;
            }
            config.backendConfig = &bnconfig;
            MNNconfig_reg_ = config;
            auto session = net->createSession(config);

            /** get inputs & outputs shape **/
            auto inputs = net->getSessionInputAll(session);    //  multi inputs
            rk_dims_ = inputs[regKernelNane]->shape();
            regMap_dims_ = inputs[regMapName]->shape();
            auto Routput1 = net->getSessionOutput(session, regOutputName.data()); //  signal output
            regOut_dims_ =  Routput1->shape();
            session_reg_  = session;

            /** set inputs reg tensor **/
            inputRegTensor = net_reg_->getSessionInputAll(session_reg_);

#if (DEBUG_TMP == 1)
            printf ("rk_dims_[%d %d %d %d]\n",rk_dims_[0],rk_dims_[1], rk_dims_[2], rk_dims_[3]);
            printf ("regMap_dims_[%d %d %d %d]\n",regMap_dims_[0],regMap_dims_[1], regMap_dims_[2], regMap_dims_[3]);
            printf ("regOut_dims_[%d %d %d %d]\n",regOut_dims_[0],clsOut_dims_[1], regOut_dims_[2], regOut_dims_[3]);
            float memoryUsage = 0.0f;
            net->getSessionInfo(session, MNN::Interpreter::MEMORY, &memoryUsage);
            float flops = 0.0f;
            net->getSessionInfo(session, MNN::Interpreter::FLOPS, &flops);
            int backendType[2];
            net->getSessionInfo(session, MNN::Interpreter::BACKENDS, backendType);
            MNN_PRINT("SearSession Info: memory use [%f] MB, flops is [%f] M, backendType is [%d], PrecisionMode:[%d] PowerMode:[%d]\n",
                      memoryUsage, flops, backendType[0], bnconfig.precision, bnconfig.power);
#endif
            printf ("#MNN NetInit Load regModel:{%s} Sucess; Thread_num:[%d] Device:[%d]#\n",model_path, numThread_, config.type);

        }
#if (USE_DOUBLE_HEADER == 1)
        else if (model_idx == 2)
        {
            /** net input & output names **/
            std::string clsKernelInput = "clskernel";
            std::string clsMapInput = "x";
            std::string clsOutput = "57";

            std::string regKernelInput = "regkernel";
            std::string regMapInput    = "y";
            std::string regOutput = "62";
            /**  create network session  **/
            auto net = std::shared_ptr<MNN::Interpreter>(MNN::Interpreter::createFromFile(model_path));
            MNN::ScheduleConfig config;
            config.type      = MNN_FORWARD_OPENCL;  // MNN_FORWARD_CPU  MNN_FORWARD_OPENCL
            config.numThread = numThread_;
            if (config.type  == MNN_FORWARD_OPENCL)
                config.mode  = MNN_GPU_TUNING_FAST;
            net_double_ = net;
            MNN::BackendConfig bnconfig;
            //bnconfig.memory = MNN::BackendConfig::Memory_Low;
            //bnconfig.power = MNN::BackendConfig::Power_Low;
            bnconfig.precision =  MNN::BackendConfig::Precision_Low;  // Precision_Normal = 0, Precision_High, Precision_Low Precision_Low_BF16
            config.backendConfig = &bnconfig;
            MNNconfig_double_ = config;
            //net->setSessionMode(MNN::Interpreter::Session_Input_User);
            auto session = net->createSession(config);
            /** get inputs & outputs shape **/
            auto inputs = net->getSessionInputAll(session);    //  multi inputs
            ck_dims_ = inputs[clsKernelInput]->shape();
            clsMap_dims_ = inputs[clsMapInput]->shape();
            rk_dims_ = inputs[regKernelInput]->shape();
            regMap_dims_ = inputs[regMapInput]->shape();
            auto Coutput1 = net->getSessionOutput(session, regOutput.data()); //  signal output
            clsOut_dims_ =  Coutput1->shape();
            auto Routput1 = net->getSessionOutput(session, regOutput.data()); //  signal output
            regOut_dims_ =  Routput1->shape();
            session_double_  = session;

            /** set inputs reg tensor **/
            input4Tensors = net_double_->getSessionInputAll(session_double_);
            input4Tensors[clsKernelInput] = clsKernelTensor_;
            input4Tensors[clsMapInput]    = clsMapTensor_;
            input4Tensors[regKernelInput] = regKernelTensor_;
            input4Tensors[regMapInput]    = regMapTensor_;
//            net_double_->resizeSession(session_double_);

#if (DEBUG_TMP == 1)
            printf ("ck_dims_[%d %d %d %d]\n",ck_dims_[0],ck_dims_[1], ck_dims_[2], ck_dims_[3]);
            printf ("clsMap_dims_[%d %d %d %d]\n",clsMap_dims_[0],clsMap_dims_[1], clsMap_dims_[2], clsMap_dims_[3]);
            printf ("clsOut_dims_[%d %d %d %d]\n",clsOut_dims_[0],clsOut_dims_[1], clsOut_dims_[2], clsOut_dims_[3]);
            printf ("rk_dims_[%d %d %d %d]\n",rk_dims_[0],rk_dims_[1], rk_dims_[2], rk_dims_[3]);
            printf ("regMap_dims_[%d %d %d %d]\n",regMap_dims_[0],regMap_dims_[1], regMap_dims_[2], regMap_dims_[3]);
            printf ("regOut_dims_[%d %d %d %d]\n",regOut_dims_[0],clsOut_dims_[1], regOut_dims_[2], regOut_dims_[3]);
            float memoryUsage = 0.0f;
            net->getSessionInfo(session, MNN::Interpreter::MEMORY, &memoryUsage);
            float flops = 0.0f;
            net->getSessionInfo(session, MNN::Interpreter::FLOPS, &flops);
            int backendType[2];
            net->getSessionInfo(session, MNN::Interpreter::BACKENDS, backendType);
            MNN_PRINT("SearSession Info: memory use [%f] MB, flops is [%f] M, backendType is [%d], PrecisionMode:[%d] PowerMode:[%d]\n",
                      memoryUsage, flops, backendType[0], bnconfig.precision, bnconfig.power);
#endif
            printf ("#MNN NetInit Load DoubleHeaderModel:{%s} Sucess!; Thread_num:[%d] Device:[%d] #\n",
                    model_path, numThread_, config.type);

        }
#endif
        initialized_ = true;
        printf("*** Loading Model sucess!!! ***\n");
        return 0;
    }


    /**
     * @brief Release the session that belongs to one of the loaded networks.
     *
     * Does nothing for a network whose interpreter or session is null, so it
     * is safe to call after a failed load and to call more than once. The
     * session handle is cleared afterwards so it cannot be reused.
     *
     * @param model_idx 0 = classification, 1 = regression, 2 = double-header
     *                  network (only when USE_DOUBLE_HEADER == 1). Other values
     *                  are ignored.
     * @return Always 0.
     */
    int ForwardNet::net_deinit(int model_idx)
    {
        if (model_idx == 0)
        {
            if (net_cls_ != nullptr && session_cls_ != nullptr)
                net_cls_->releaseSession(session_cls_);
            session_cls_ = nullptr;
        }
        else if (model_idx == 1)
        {
            if (net_reg_ != nullptr && session_reg_ != nullptr)
                net_reg_->releaseSession(session_reg_);
            session_reg_ = nullptr;
        }
#if (USE_DOUBLE_HEADER == 1)
        else if (model_idx == 2)
        {
            if (net_double_ != nullptr && session_double_ != nullptr)
                net_double_->releaseSession(session_double_);
            session_double_ = nullptr;
        }
#endif
        return 0;
    }

    int ForwardNet::clsMnnForward(const float *cmap_data, float* coutput_data, bool dataReady)
    {
        t1_ = steady_clock::now();
        if (!dataReady){
            auto clsM_data   = clsMapTensor_->host<float>();
            auto clsM_size   = static_cast<int>(clsMapTensor_->elementSize());
            ::memcpy(clsM_data, cmap_data, sizeof(float)*clsM_size);
        }

        auto clsKernelInput = inputClsTensor[clsKernelName];
        auto clsMapInput = inputClsTensor[clsMapName];
        clsKernelInput->copyFromHostTensor(clsKernelTensor_);
        clsMapInput->copyFromHostTensor(clsMapTensor_);
        /**run network **/
        net_cls_->runSession(session_cls_);
        /**get output data **/
        auto outTensor = net_cls_->getSessionOutput(session_cls_, clsOutputName.data());
        auto outshape = outTensor->shape();
        int outsize = outTensor->elementSize();
        int outClsSize = outshape[0]*outshape[1];
        outTensor->copyToHostTensor(clsOutTensor_);
        //printf ("#########  showTensor(clsOutTensor_) ##############\n");
        //showTensor(clsOutTensor_,outClsSize,5*19*19);
        ::memcpy(coutput_data, clsOutTensor_->host<float>(), sizeof(float)*outClsSize);
        t2_ = steady_clock::now();
#if (DEBUG_TMP == 1)
        time_span_ = duration_cast < duration < double >> (t2_ - t1_);
        printf("#Run cls  getSessionOutput time is %f ms.\n",1000 * time_span_.count());
#endif
        return 0;
    }

    int ForwardNet::regMnnForward(const float *rmap_data, float* routput_data, bool dataReady)
    {
        t1_ = steady_clock::now();
        if (!dataReady) {
            auto regM_data   = regMapTensor_->host<float>();
            auto regM_size   = static_cast<int>(regMapTensor_->elementSize());
            ::memcpy(regM_data, rmap_data, sizeof(float)*regM_size);
        }

        auto regKernelInput = inputRegTensor[regKernelNane];
        auto regMapInput = inputRegTensor[regMapName];
        regKernelInput->copyFromHostTensor(regKernelTensor_);
        regMapInput->copyFromHostTensor(regMapTensor_);
        /**run network **/
        net_reg_->runSession(session_reg_);
        /**get output data **/
        auto outTensor = net_reg_->getSessionOutput(session_reg_, regOutputName.data());
        auto outshape = outTensor->shape();
        int outsize = outTensor->elementSize();
        int outRegSize = outshape[0]*outshape[1];
        outTensor->copyToHostTensor(regOutTensor_);
        //printf ("#########  showTensor(regOutTensor_) ##############\n");
        //showTensor(regOutTensor_,outRegSize,5*19*19);
        ::memcpy(routput_data, regOutTensor_->host<float>(), sizeof(float)*outRegSize);
        t2_ = steady_clock::now();
#if (DEBUG_TMP == 1)
        time_span_ = duration_cast < duration < double >> (t2_ - t1_);
        printf("#Run reg getSessionOutput time is %f ms.\n",1000 * time_span_.count());
#endif
        return 0;
    }

    /**
     * @brief Copy new kernel data into the two host kernel staging tensors.
     *
     * The number of floats copied into each tensor is the product of that
     * tensor's shape extents, for any rank. Sets updataRKDone_ and
     * updataCKDone_ to 1 after both copies, and records the copy duration in
     * time_span_.
     *
     * @param ckernel_data Source for clsKernelTensor_; must hold at least as
     *                     many floats as the tensor has elements.
     * @param rkernel_data Source for regKernelTensor_; same requirement.
     * @return 0 on success, -1 if either source pointer is null (nothing is
     *         copied and the done-flags are left unchanged).
     */
    int ForwardNet::updateTwoKernels(const float *ckernel_data, const float *rkernel_data)
    {
        if (ckernel_data == nullptr || rkernel_data == nullptr)
        {
            printf("#updateTwoKernels failed: null kernel data pointer\n");
            return -1;
        }

        auto clsK_data = clsKernelTensor_->host<float>();
        auto regK_data = regKernelTensor_->host<float>();

        // Element counts come from the shape product rather than from fixed
        // indices 0..3, so a tensor of another rank is not read out of bounds.
        int clsK_size = 1;
        for (const int extent : clsKernelTensor_->shape())
            clsK_size *= extent;
        int regK_size = 1;
        for (const int extent : regKernelTensor_->shape())
            regK_size *= extent;

        t1_ = steady_clock::now();
        ::memcpy(clsK_data, ckernel_data, sizeof(float)*clsK_size);
        ::memcpy(regK_data, rkernel_data, sizeof(float)*regK_size);
        t2_ = steady_clock::now();
        time_span_ = duration_cast < duration < double >> (t2_ - t1_);
        updataRKDone_ = 1;
        updataCKDone_ = 1;
//        printf("#Run updateTwoKernels time is %f ms.\n",1000 * time_span_.count());
//        printf ("#########  showTensor(clsKernelTensor_) ##############\n");
//        showTensor(clsKernelTensor_,10*256*4*4,256*4*4);
//        printf ("#########  showTensor(regKernelTensor_) ##############\n");
//        showTensor(regKernelTensor_,20*256*4*4,256*4*4);
        return 0;
    }

    /**
     * @brief Copy new map data into the two host map staging tensors.
     *
     * The number of floats copied into each tensor is the product of that
     * tensor's shape extents, for any rank. Sets updataRMDone_ and
     * updataCMDone_ to 1 after both copies, and records the copy duration in
     * time_span_.
     *
     * @param cm_data Source for clsMapTensor_; must hold at least as many
     *                floats as the tensor has elements.
     * @param rm_data Source for regMapTensor_; same requirement.
     * @return 0 on success, -1 if either source pointer is null (nothing is
     *         copied and the done-flags are left unchanged).
     */
    int ForwardNet::updateTwoMaps(const float *cm_data, const float *rm_data)
    {
        if (cm_data == nullptr || rm_data == nullptr)
        {
            printf("#updateTwoMaps failed: null map data pointer\n");
            return -1;
        }

        auto cMapData = clsMapTensor_->host<float>();
        auto rMapData = regMapTensor_->host<float>();

        // Element counts come from the shape product rather than from fixed
        // indices 0..3, so a tensor of another rank is not read out of bounds.
        int clsM_size = 1;
        for (const int extent : clsMapTensor_->shape())
            clsM_size *= extent;
        int regM_size = 1;
        for (const int extent : regMapTensor_->shape())
            regM_size *= extent;

        t1_ = steady_clock::now();
        ::memcpy(cMapData, cm_data, sizeof(float)*clsM_size);
        ::memcpy(rMapData, rm_data, sizeof(float)*regM_size);
        t2_ = steady_clock::now();
        time_span_ = duration_cast < duration < double >> (t2_ - t1_);
        //printf("#Run updateTwoMaps time is %f ms.\n",1000 * time_span_.count());
        updataRMDone_ = 1;
        updataCMDone_ = 1;

        return 0;
    }

    /**
     * @brief Debug helper: print a tensor's metadata and its leading float values.
     *
     * Prints the shape (all extents, space separated), the byte size, the
     * dimension type, the rank and the element count, followed by up to
     * @p len values with three decimals.
     *
     * @param TensorIn Tensor to dump; a null tensor or one without host data
     *                 is reported and skipped.
     * @param len      Number of values requested; clamped to the tensor's
     *                 element count so the host buffer is never over-read.
     * @param diff     A "#[k]#" group marker is printed after every element
     *                 whose non-zero index is a multiple of @p diff. A value
     *                 of 0 disables the markers.
     */
    void ForwardNet::showTensor(MNN::Tensor *TensorIn, int len, int diff)
    {
        if (TensorIn == nullptr)
        {
            printf ("\n***** showTensor: null tensor ******\n");
            return;
        }

        const auto Tshape = TensorIn->shape();
        const auto Tdata  = TensorIn->host<float>();
        const auto Tsize  = static_cast<int>(TensorIn->size());

        // Print whatever rank the tensor has instead of assuming four extents.
        printf ("\n***** Tensor shape:[");
        for (size_t d = 0; d < Tshape.size(); ++d)
        {
            printf ("%s%d", (d == 0) ? "" : " ", Tshape[d]);
        }
        printf ("], size:%d ******\n", Tsize);
        printf ("DimensionType: %d dimensions:%d elementSize:%d\n",TensorIn->getDimensionType(), TensorIn->dimensions(), TensorIn->elementSize());

        if (Tdata == nullptr)
        {
            printf ("\n***** showTensor: no host data ******\n");
            return;
        }

        const int available = TensorIn->elementSize();
        const int count = (len < available) ? len : available;
        int lenN = 1;
        for (int i = 0; i < count; ++i)
        {
            //printf ("%.4f ",Tdata[i]);
            printf ("%.3f ",Tdata[i]);
            // diff == 0 would be a division by zero in the modulo below.
            if (diff != 0 && i != 0 && i % diff == 0){
                printf("\n#[%d]#\n",lenN);
                lenN++;
            }
        }
        printf ("\n***** END ******\n");
    }