You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
721 lines
24 KiB
721 lines
24 KiB
/*
|
|
* @Author: turboLIU
|
|
* @Date: 2022-12-28 16:59:41
|
|
* @LastEditTime: 2023-11-08 14:33:54
|
|
* @Description: Do not edit
|
|
* @FilePath: /C/src/imageMat.cpp
|
|
*/
|
|
|
|
#include <string.h>
|
|
#include <cmath>
|
|
#include <vector>
|
|
#include "mat_pixel_resize.h"
|
|
#include "imageMat.h"
|
|
|
|
|
|
|
|
// float single_bilinear(float x, float x0, float x1, float data0, float data1){
|
|
// float alpha = abs(x-x0);
|
|
// return alpha*data0 + (1-alpha) * data1;
|
|
// }
|
|
|
|
// Default constructor: leaves all members (data, dims, type) unset; the
// caller must populate the tensor before use.
TensorMat::TensorMat(){}
|
|
|
|
// Release the tensor's backing buffer, if one was ever allocated.
TensorMat::~TensorMat(){
    if(this->data != NULL){
        free(this->data);
        this->data = NULL;
    }
}
|
|
|
|
/*TensorMat::TensorMat(int b, int c, int h, int w, DATA_LAYOUT layout, DATATYPE type) {
|
|
this->width = w;
|
|
this->height = h;
|
|
this->depth = c;
|
|
this->batchsize = b;
|
|
this->data = nullptr;
|
|
this->layout = layout;
|
|
this->dataType = type;
|
|
|
|
int sz = w*h*c*b;
|
|
this->elementNum = sz;
|
|
if(type == AI_UINT8 || type == AI_INT8){
|
|
this->data = (unsigned char*)malloc(sz * sizeof(unsigned char));
|
|
this->dataByte = sz;
|
|
// DEBUG_LOG("making TensorMat: (bwhc) %dx%dx%dx%d, type: AI_INT8/AI_UINT8\n", b,w,h,c);
|
|
}else if(type==AI_FLOAT || type == AI_INT32 || type == AI_UINT32){
|
|
this->data = (unsigned char*)malloc(sz * 4);
|
|
this->dataByte = sz*4;
|
|
// DEBUG_LOG("making TensorMat: (bwhc) %dx%dx%dx%d, type: AI_INT32/AI_UINT32/AI_FLOAT\n", b,w,h,c);
|
|
}else if(type==AI_INT16 || type == AI_FLOAT16 || type == AI_UINT16){
|
|
this->data = (unsigned char*)malloc(sz * 2);
|
|
this->dataByte = sz*2;
|
|
// DEBUG_LOG("making TensorMat: (bwhc) %dx%dx%dx%d, type: AI_INT16/AI_UINT16/AI_FLOAT16\n", b,w,h,c);
|
|
}else if(type == AI_INT64 || type == AI_UINT64 || type == AI_DOUBLE){
|
|
this->data = (unsigned char*)malloc(sz * 8);
|
|
this->dataByte = sz*8;
|
|
// DEBUG_LOG("making TensorMat: (bwhc) %dx%dx%dx%d, type: AI_INT64/AI_UINT64/AI_DOUBEL\n", b,w,h,c);
|
|
}else if(type == AI_BOOL){
|
|
this->data = (bool*)malloc(sz*sizeof(bool));
|
|
this->dataByte = sz * sizeof(bool);
|
|
}else{
|
|
DEBUG_LOG("datatype error\n");
|
|
}
|
|
|
|
};
|
|
|
|
TensorMat::TensorMat(int b, int c, int h, int w, DATA_LAYOUT layout, void* pdata, DATATYPE type){
|
|
this->width = w;
|
|
this->height = h;
|
|
this->depth = c;
|
|
this->batchsize = b;
|
|
this->data = nullptr;
|
|
this->layout = layout;
|
|
this->dataType = type;
|
|
|
|
int sz = w*h*c*b;
|
|
this->elementNum = sz;
|
|
if(type == AI_UINT8 || type == AI_INT8){
|
|
this->data = (unsigned char*)malloc(sz * sizeof(unsigned char));
|
|
memcpy(this->data, pdata, sz * sizeof(unsigned char));
|
|
this->dataByte = sz * 1;
|
|
}else if(type==AI_FLOAT || type == AI_INT32 || type == AI_UINT32){
|
|
this->data = (unsigned char*)malloc(sz * 4);
|
|
memcpy(this->data, pdata, sz * sizeof(float));
|
|
this->dataByte = sz * 4;
|
|
}else if(type==AI_INT16 || type == AI_FLOAT16 || type == AI_UINT16){
|
|
this->data = (unsigned char*)malloc(sz * 2);
|
|
memcpy(this->data, pdata, sz * 2);
|
|
this->dataByte = sz * 2;
|
|
}else if(type == AI_INT64 || type == AI_UINT64 || type == AI_DOUBLE){
|
|
this->data = (unsigned char*)malloc(sz * 8);
|
|
memcpy(this->data, pdata, sz * 8);
|
|
this->dataByte = sz * 8;
|
|
}else if(type == AI_BOOL){
|
|
this->data = (unsigned char*)malloc(sz * sizeof(bool));
|
|
memcpy(this->data, pdata, sz * sizeof(bool));
|
|
this->dataByte = sz * sizeof(bool);
|
|
}else{
|
|
printf("datatype error\n");
|
|
}
|
|
|
|
};*/
|
|
|
|
/*
 * Copy this tensor's buffer and shape metadata into an existing TenMat.
 * dst must already own a buffer large enough for this->dataByte bytes.
 * Returns 0 on success, 1 when dst has no buffer.
 * Note: dst.elementNum is set to width*height*depth (batch not included),
 * matching the original behavior.
 */
int TensorMat::toTenMat(TenMat &dst){
    if(dst.data == NULL){
        printf("TenMat is NULL\n");
        return 1;
    }
    memcpy(dst.data, this->data, this->dataByte);
    dst.width      = this->width;
    dst.height     = this->height;
    dst.depth      = this->depth;
    dst.type       = this->dataType;
    dst.layout     = this->layout;
    dst.elementNum = this->width * this->height * this->depth;
    return 0;
};
|
|
// Nearest-neighbour resize for channels-last tensors.
// Not implemented: always reports failure (1) and leaves the data untouched.
int TensorMat::resize_nearest_nhwc(int width, int height){
    //DEBUG_LOG("resize_nearest_nhwc not support for now");
    (void)width;
    (void)height;
    return 1;
};
|
|
/*
 * Nearest-neighbour resize of a CHW-layout tensor, in place.
 * NOTE(review): treats the buffer as float regardless of dataType — the
 * float-type check in resize() is commented out; confirm call sites only
 * pass AI_FLOAT tensors.
 * Returns 0 on success, 1 on error (empty tensor / allocation failure).
 */
int TensorMat::resize_nearest_nchw(int newwidth, int newheight){
    if(this->data == NULL){
        //DEBUG_LOG("TensorMat is empty, should init first");
        return 1;
    }
    if(this->width == newwidth && this->height == newheight)return 0;
    float h_scale = (float)this->height / newheight;
    float w_scale = (float)this->width / newwidth;
    float* ndata = (float*)malloc(this->batchsize*this->depth*newheight*newwidth*sizeof(float));
    if(ndata == NULL) return 1;   // allocation failed; original tensor untouched
    float* newdata = ndata;
    float* input = (float*)this->data;
    for (int n = 0; n < this->batchsize; ++n) {
        for (int c = 0; c < this->depth; ++c) {
            for (int y = 0; y < newheight; ++y) {
                // clamp guards against float rounding pushing the source
                // index one past the last row/column
                int j = y * h_scale;
                if(j >= this->height) j = this->height - 1;
                for (int x = 0; x < newwidth; ++x) {
                    int i = x * w_scale;
                    if(i >= this->width) i = this->width - 1;
                    newdata[y * newwidth + x] = input[j * this->width + i];
                }
            }
            // advance one source plane and one destination plane
            input += this->height * this->width;
            newdata += newheight * newwidth;
        }
    }
    free(this->data);
    this->data = ndata;
    this->width = newwidth;
    this->height = newheight;
    // bug fix: keep element/byte counts consistent with the new shape
    // (toTenMat copies dataByte bytes; l2_norm/add iterate elementNum)
    this->elementNum = this->batchsize * this->depth * newheight * newwidth;
    this->dataByte = this->elementNum * (int)sizeof(float);
    return 0;
};
|
|
|
|
int TensorMat::resize_bilinear_nchw(int newwidth, int newheight){
|
|
int ret = 0;
|
|
if(this->data == NULL){
|
|
//DEBUG_LOG("TensorMat is empty, should init first");
|
|
return 1;
|
|
}
|
|
if(this->width == newwidth && this->height == newheight)return 0;
|
|
float* ndata = (float*)malloc(this->batchsize*this->depth*newheight*newwidth*sizeof(float));
|
|
float* newdata = ndata;
|
|
float* input = (float*)this->data;
|
|
// printf("\n%f\n", input[0]);
|
|
float h_scale = (float)this->height / newheight;
|
|
float w_scale = (float)this->width / newwidth;
|
|
// printf("\nscale: %f/%f\n", h_scale, w_scale);
|
|
for (int n = 0; n < this->batchsize; ++n) {
|
|
for (int c = 0; c < this->depth; ++c) {
|
|
float * inputc = input + n * this->depth * this->width * this->height + c * this->width * this->height;
|
|
float * newdatac = newdata + n*this->depth*newheight*newwidth + c * newwidth * newheight;
|
|
resize_bilinear_float_c1(inputc, this->width, this->height, newdatac, newwidth, newheight);
|
|
}
|
|
}
|
|
free(this->data);
|
|
this->data = (void*)ndata;
|
|
this->width = newwidth;
|
|
this->height = newheight;
|
|
return ret;
|
|
}
|
|
|
|
/*
 * Bilinear resize for channels-last (HWC) tensors — not supported.
 *
 * The previous version returned 1 unconditionally and then carried ~100
 * lines of unreachable draft code (which was also incorrect: it advanced
 * the source pointer by one row per destination row instead of by the
 * scaled y index). The dead code has been removed; behavior is unchanged:
 * the function always returns 1 and leaves the tensor untouched.
 */
int TensorMat::resize_bilinear_nhwc(int newwidth, int newheight){
    //DEBUG_LOG("resize_bilinear_nhwc not support for now");
    (void)newwidth;
    (void)newheight;
    return 1;
}
|
|
|
|
int TensorMat::resize(int width, int height, bool interpo){
|
|
int ret = 0;
|
|
/*if (this->dataType != AI_FLOAT) {
|
|
DEBUG_LOG("only float type support");
|
|
return 1;
|
|
}*/
|
|
if(interpo){
|
|
if(this->layout == HWC || this->layout == WHC){
|
|
ret = this->resize_bilinear_nhwc(width, height);
|
|
// return ret;
|
|
}else{
|
|
ret = this->resize_bilinear_nchw(width, height);
|
|
// return ret;
|
|
}
|
|
}else{
|
|
if(this->layout == CHW){
|
|
ret = this->resize_nearest_nchw(width, height);
|
|
// return ret;
|
|
}else{
|
|
ret = this->resize_nearest_nhwc(width, height);
|
|
// return ret;
|
|
}
|
|
}
|
|
return ret;
|
|
};
|
|
|
|
int TensorMat::l2_norm(){
|
|
int ret = 0;
|
|
float squareSum = 0;
|
|
float* pdata = (float*)this->data;
|
|
int num = this->elementNum;
|
|
for(int n=0; n<num; n++){
|
|
squareSum += (pdata[n]*pdata[n]);
|
|
}
|
|
float sqrtSum = sqrt(squareSum);
|
|
for(int n=0; n<num; n++){
|
|
pdata[n] = pdata[n]/sqrtSum;
|
|
}
|
|
|
|
return ret;
|
|
};
|
|
|
|
/*
 * Element-wise in-place addition: this += m.
 * Both tensors must share the exact (b, c, h, w) shape; buffers are treated
 * as float. Returns 0 on success, 1 on a shape mismatch.
 */
int TensorMat::add(TensorMat * m){
    bool sameShape = (this->batchsize == m->batchsize) &&
                     (this->depth     == m->depth)     &&
                     (this->width     == m->width)     &&
                     (this->height    == m->height);
    if(!sameShape){
        //DEBUG_LOG("both shape must be same, [%dx%dx%dx%d] != [%dx%dx%dx%d]",
        //          this->batchsize, this->depth, this->height, this->width,
        //          m->batchsize, m->depth, m->height, m->width);
        return 1;
    }
    float *dst = (float*)this->data;
    const float *src = (const float*)m->data;
    for(int i=0; i<this->elementNum; ++i){
        dst[i] += src[i];
    }
    return 0;
};
|
|
|
|
|
|
|
|
#ifdef WITH_OPENCV
/*
 * Wrap the tensor's pixel buffer in a cv::Mat header. No copy is made: the
 * returned Mat aliases this->data, so it must not outlive this TensorMat.
 * Supported inlayouts: RGB/BGR (8UC3) and RGBA/BGRA (8UC4); NV12 and any
 * other layout yield an empty Mat.
 */
cv::Mat TensorMat::toMat(){
    cv::Mat CvOutMat;
    if(this->inlayout == NV12){
        printf("nv12 not support for now\n");
        return CvOutMat;
    }else if(this->inlayout == RGB || this->inlayout == BGR){   // bug fix: condition was missing its closing ')'
        CvOutMat = cv::Mat(this->height, this->width, CV_8UC3);
        CvOutMat.data = (unsigned char*)this->data;             // cast: Mat::data is uchar*
        return CvOutMat;
    }else if(this->inlayout == RGBA || this->inlayout == BGRA){
        CvOutMat = cv::Mat(this->height, this->width, CV_8UC4);
        CvOutMat.data = (unsigned char*)this->data;
        return CvOutMat;
    }else{
        printf("image inlayout not support for now\n");
        return CvOutMat;
    }
}
#endif
|
|
|
|
// Default constructor: members are left unset; use the sized constructor to
// actually allocate image storage.
ImageMat::ImageMat(){}
|
|
/*
 * Allocate an image buffer of w x h (x c for packed layouts, or w*h*1.5
 * bytes for NV12) and optionally copy initial pixel data.
 * pdata may be NULL, in which case the buffer is left uninitialized.
 */
ImageMat::ImageMat(int w, int h, int c, DATA_LAYOUT lout, INPUT_LAYOUT inlout,MODE mode, int timestamp, char* pdata)
{
    this->width = w;
    this->height = h;
    this->depth = c;
    this->data = nullptr;
    this->layout = lout;
    this->inlayout = inlout;
    this->mode = mode;
    this->timestamp = timestamp;
    // NV12 carries a full-resolution Y plane plus a half-size interleaved
    // UV plane, hence the 1.5x factor; all other layouts are w*h*c bytes.
    int sz = (inlout == NV12) ? int(w*h*1.5) : (w*h*c);
    this->data = (unsigned char*)malloc(sz * sizeof(unsigned char));
    if(pdata != NULL){
        memcpy(this->data, pdata, sz * sizeof(unsigned char));
    }
}
|
|
|
|
/*
 * Deep-copy constructor from a plain C ImgMat descriptor: copies the
 * geometry/metadata fields and clones the pixel buffer.
 *
 * NOTE(review): layout and inlayout are NOT copied from img (the
 * assignments are commented out below), yet this->inlayout is read to size
 * the buffer — the NV12 branch therefore depends on whatever default value
 * the member holds. Confirm whether ImgMat carries these fields and whether
 * they should be copied here.
 */
ImageMat::ImageMat(ImgMat img){
    this->width = img.width;
    this->height = img.height;
    this->depth = img.depth;
    this->data = nullptr;
    //this->layout = img.layout;
    //this->inlayout = img.inlayout;
    this->frameID = img.frameID;
    // relayout = img.relayout;
    this->mode = img.mode;
    this->timestamp = img.timestamp;
    int sz = 0;
    if(this->inlayout == NV12){
        // NV12: full-res Y plane + half-size interleaved UV plane (1.5x)
        sz = int(width*height*1.5);
        this->data = (unsigned char*)malloc(sz * sizeof(unsigned char));
        memcpy(this->data, img.data, sz*sizeof(unsigned char));
    }else{
        // packed layouts: one byte per channel per pixel
        sz = width*height*depth;
        this->data = (unsigned char*)malloc(sz * sizeof(unsigned char));
        memcpy(this->data, img.data, sz * sizeof(unsigned char));
    }
}
|
|
|
|
|
|
|
|
// Free the pixel buffer if this ImageMat still owns one.
ImageMat::~ImageMat()
{
    if(this->data != NULL){
        free(this->data);
        this->data = NULL;
    }
}
|
|
|
|
int ImageMat::resize(int dstw, int dsth){
|
|
int ret = 0;
|
|
if(this->data == NULL){
|
|
//DEBUG_LOG("TensorMat is empty, should init first");
|
|
return 1;
|
|
}
|
|
unsigned char* newdata = NULL;
|
|
if(this->inlayout == RGB || this->inlayout == BGR){
|
|
newdata = (unsigned char*)malloc(dsth*dstw*3);
|
|
resize_bilinear_c3((unsigned char*)this->data, this->width, this->height, newdata, dstw, dsth);
|
|
}else if(this->inlayout == RGBA || this->inlayout == BGRA){
|
|
newdata = (unsigned char*)malloc(dsth*dstw*4);
|
|
resize_bilinear_c4((unsigned char*)this->data, this->width, this->height, newdata, dstw, dsth);
|
|
}else if(this->inlayout == NV12 || this->inlayout == NV21){
|
|
//DEBUG_LOG("NV12/NV21 data cannot resize for now");
|
|
return 1;
|
|
}
|
|
delete(this->data);
|
|
this->data = newdata;
|
|
this->width = dstw;
|
|
this->height = dsth;
|
|
return 0;
|
|
};
|
|
|
|
/*
 * Return a copy of the axis-aligned sub-image [x1,x2) x [y1,y2); the
 * rectangle is clamped to the image bounds first. An empty or inverted
 * rectangle yields a default-constructed (empty) ImageMat.
 *
 * NOTE(review): copies width*depth bytes per row, which is only valid for
 * packed layouts (RGB/RGBA etc.), not NV12 — confirm callers. Also, no
 * user-defined copy constructor is visible here, so returning by value may
 * shallow-copy the owning `data` pointer — verify the class definition.
 */
ImageMat ImageMat::crop(int x1, int y1, int x2, int y2){
    int cropx1 = std::max(0, x1);
    int cropx2 = std::min(this->width, x2);
    int cropy1 = std::max(0, y1);
    int cropy2 = std::min(this->height, y2);

    int cropw = cropx2 - cropx1;
    int croph = cropy2 - cropy1;
    // bug fix: the old test (cropw*croph <= 0) accepted the case where BOTH
    // extents were negative (product positive), constructing an image with
    // negative dimensions; check each extent independently. Stray debug
    // printfs of dst.data were also removed.
    if(cropw <= 0 || croph <= 0){
        return ImageMat();
    }
    ImageMat dst(cropw, croph, this->depth, this->layout, this->inlayout, this->mode, this->timestamp, NULL);
    unsigned char* dstp = (unsigned char*)dst.data;
    unsigned char* srcp = (unsigned char*)this->data + (cropy1 * this->width + cropx1) * this->depth;
    int rowBytes = cropw * this->depth;
    for(int i=0; i<croph; i++){
        memcpy(dstp, srcp, rowBytes);
        dstp += rowBytes;
        srcp += this->width * this->depth;
    }
    return dst;
};
|
|
|
|
#ifdef WITH_OPENCV
/*
 * Wrap the image's pixel buffer in a cv::Mat header. No copy is made: the
 * returned Mat aliases this->data and must not outlive this ImageMat.
 * Supported inlayouts: RGB/BGR (8UC3) and RGBA/BGRA (8UC4); NV12 and any
 * other layout yield an empty Mat.
 */
cv::Mat ImageMat::toMat(){   // bug fix: return type was 'CV::Mat' (wrong case)
    cv::Mat CvOutMat;
    if(this->inlayout == NV12){
        printf("nv12 not support for now\n");
        return CvOutMat;
    }else if(this->inlayout == RGB || this->inlayout == BGR){   // bug fix: condition was missing its closing ')'
        CvOutMat = cv::Mat(this->height, this->width, CV_8UC3);
        CvOutMat.data = (unsigned char*)this->data;             // cast: Mat::data is uchar*
        return CvOutMat;
    }else if(this->inlayout == RGBA || this->inlayout == BGRA){
        CvOutMat = cv::Mat(this->height, this->width, CV_8UC4);
        CvOutMat.data = (unsigned char*)this->data;
        return CvOutMat;
    }else{
        printf("image inlayout not support for now\n");
        return CvOutMat;
    }
}
#endif
|
|
|
|
/*
 * Concatenate tensors along the batch (dim 0) axis into dstMat.
 * All inputs must share depth/height/width/dataType; dstMat must already be
 * allocated with batchsize == sum of the input batch sizes and matching
 * remaining dims. Returns 0 on success, 1 on any validation failure.
 * (Cleanup: removed unused locals `ret` and `subbatchsize`.)
 */
int concat_batch(std::vector<TensorMat*> mats, TensorMat* dstMat){
    int num = mats.size();
    if(num == 0){
        //DEBUG_LOG("ConCat error: input vector is empty");
        return 1;
    }
    int totalnum = 0;
    int subdepth = mats.at(0)->depth;
    int subheight = mats.at(0)->height;
    int subwidth = mats.at(0)->width;
    DATATYPE dtype = mats.at(0)->dataType;
    for(auto m : mats){
        if(subdepth != m->depth || subheight != m->height || subwidth != m->width || dtype != m->dataType){
            //DEBUG_LOG("ConCat error: TensorMat attr not matched");
            return 1;
        }
        totalnum += m->batchsize;
    }

    if(dstMat == NULL){
        //DEBUG_LOG("ConCat error: dstMat is NULL");
        return 1;
    }
    if(dstMat->batchsize != totalnum || dstMat->depth != subdepth || dstMat->height != subheight || dstMat->width != subwidth || dstMat->dataType != dtype){
        //DEBUG_LOG("ConCat error: dstMat attr not matched");
        return 1;
    }
    // batch is the outermost axis, so concatenation is a straight
    // back-to-back copy of each source buffer
    char * pdstdata = (char*)dstMat->data;
    for(int i=0; i<num; i++){
        TensorMat* src = mats.at(i);
        memcpy(pdstdata, src->data, src->dataByte);
        pdstdata += src->dataByte;
    }
    return 0;
}
|
|
|
|
/*
 * Concatenate tensors along the channel (dim 1) axis into dstMat.
 * Inputs must agree on batchsize/height/width/dataType; dstMat must be
 * pre-allocated with depth == sum of the input depths.
 * Returns 0 on success, 1 on any validation failure.
 */
int concat_depth(std::vector<TensorMat*> mats, TensorMat * dstMat){
    int num = mats.size();
    if(num == 0){
        //DEBUG_LOG("ConCat error: input vector is empty");
        return 1;
    }
    TensorMat* first = mats.at(0);
    int subbatchsize = first->batchsize;
    int subheight = first->height;
    int subwidth = first->width;
    DATATYPE dtype = first->dataType;
    int totaldepth = 0;
    for(auto m : mats){
        if(subbatchsize != m->batchsize || subheight != m->height || subwidth != m->width || dtype != m->dataType){
            //DEBUG_LOG("ConCat error: TensorMat attr not matched");
            return 1;
        }
        totaldepth += m->depth;
    }

    if(dstMat == NULL){
        //DEBUG_LOG("ConCat error: dstMat is NULL");
        return 1;
    }
    if(dstMat->batchsize != subbatchsize || dstMat->depth != totaldepth || dstMat->height != subheight || dstMat->width != subwidth || dstMat->dataType != dtype){
        //DEBUG_LOG("ConCat error: dstMat attr not matched");
        return 1;
    }
    // per-element byte size, inferred from the destination tensor
    int elesize = dstMat->dataByte / dstMat->elementNum;
    int planeBytes = subwidth * subheight * elesize;   // one channel plane
    // dst is written strictly linearly: for each batch index, append every
    // source's full channel block in order
    char* dstCursor = (char*)dstMat->data;
    for(int b=0; b<subbatchsize; b++){
        for(int n=0; n<num; n++){
            TensorMat* src = mats.at(n);
            int srcBatchBytes = src->depth * planeBytes;
            memcpy(dstCursor, (char*)src->data + b*srcBatchBytes, srcBatchBytes);
            dstCursor += srcBatchBytes;
        }
    }
    return 0;
}
|
|
|
|
/*
 * Concatenate tensors along the height (dim 2) axis into dstMat.
 * Inputs must agree on batchsize/depth/width/dataType; dstMat must be
 * pre-allocated with height == sum of the input heights.
 * Returns 0 on success, 1 on any validation failure.
 */
int concat_height(std::vector<TensorMat*> mats, TensorMat * dstMat){
    int num = mats.size();
    if(num == 0){
        //DEBUG_LOG("ConCat error: input vector is empty");
        return 1;
    }
    TensorMat* first = mats.at(0);
    int subbatchsize = first->batchsize;
    int subdepth = first->depth;
    int subwidth = first->width;
    DATATYPE dtype = first->dataType;
    int totalheight = 0;
    for(auto m : mats){
        if(subbatchsize != m->batchsize || subdepth != m->depth || subwidth != m->width || dtype != m->dataType){
            //DEBUG_LOG("ConCat error: TensorMat attr not matched");
            return 1;
        }
        totalheight += m->height;
    }

    if(dstMat == NULL){
        //DEBUG_LOG("ConCat error: dstMat is NULL");
        return 1;
    }
    if(dstMat->batchsize != subbatchsize || dstMat->depth != subdepth || dstMat->height != totalheight || dstMat->width != subwidth || dstMat->dataType != dtype){
        //DEBUG_LOG("ConCat error: dstMat attr not matched");
        return 1;
    }
    // per-element byte size, inferred from the destination tensor
    int elesize = dstMat->dataByte / dstMat->elementNum;
    // dst is written strictly linearly: for each (batch, channel) plane,
    // append every source's rows for that same plane
    char* dstCursor = (char*)dstMat->data;
    for(int b=0; b<subbatchsize; b++){
        for(int c=0; c < subdepth; c++){
            for(int n=0; n<num; n++){
                TensorMat* src = mats.at(n);
                int srcPlaneBytes = src->height * subwidth * elesize;
                char* srcPlane = (char*)src->data + (b*subdepth + c)*srcPlaneBytes;
                memcpy(dstCursor, srcPlane, srcPlaneBytes);
                dstCursor += srcPlaneBytes;
            }
        }
    }
    return 0;
}
|
|
|
|
/*
 * Concatenate tensors along the width (dim 3) axis into dstMat.
 * Inputs must agree on batchsize/depth/height/dataType; dstMat must be
 * pre-allocated with width == sum of the input widths.
 * Returns 0 on success, 1 on any validation failure.
 */
int concat_width(std::vector<TensorMat*> mats, TensorMat* dstMat){
    int num = mats.size();
    if(num == 0){
        //DEBUG_LOG("ConCat error: input vector is empty");
        return 1;
    }
    TensorMat* first = mats.at(0);
    int subbatchsize = first->batchsize;
    int subdepth = first->depth;
    int subheight = first->height;
    DATATYPE dtype = first->dataType;
    int totalwidth = 0;
    for(auto m : mats){
        if(subbatchsize != m->batchsize || subdepth != m->depth || subheight != m->height || dtype != m->dataType){
            //DEBUG_LOG("ConCat error: TensorMat attr not matched");
            return 1;
        }
        totalwidth += m->width;
    }

    if(dstMat == NULL){
        //DEBUG_LOG("ConCat error: dstMat is NULL");
        return 1;
    }
    if(dstMat->batchsize != subbatchsize || dstMat->depth != subdepth || dstMat->height != subheight || dstMat->width != totalwidth || dstMat->dataType != dtype){
        //DEBUG_LOG("ConCat error: dstMat attr not matched");
        return 1;
    }
    // per-element byte size, inferred from the destination tensor
    int elesize = dstMat->dataByte / dstMat->elementNum;
    // dst is written strictly linearly, row by row: each output row is
    // every source's matching row laid end to end
    char* dstCursor = (char*)dstMat->data;
    for(int b=0; b<subbatchsize; b++){
        for(int c=0; c < subdepth; c++){
            for(int h=0; h<subheight; h++){
                for(int n=0; n<num; n++){
                    TensorMat* src = mats.at(n);
                    int srcRowBytes = src->width * elesize;
                    char* srcRow = (char*)src->data + ((b*subdepth + c)*subheight + h)*srcRowBytes;
                    memcpy(dstCursor, srcRow, srcRowBytes);
                    dstCursor += srcRowBytes;
                }
            }
        }
    }
    return 0;
}
|
|
|
|
/*
 * Concatenate tensors along dimension `dim` (0=batch, 1=depth, 2=height,
 * 3=width) into the pre-allocated dstMat.
 * Returns 0 on success, 1 on any error (empty input, elementNum mismatch,
 * invalid dim, or a failure in the per-axis helper).
 *
 * NOTE(review): the equal-elementNum precondition is kept from the original,
 * but it rejects inputs whose sizes differ along the concat axis itself —
 * confirm this restriction is intended.
 */
int concat(std::vector<TensorMat*> mats, int dim, TensorMat * dstMat){
    // bug fix: guard before mats.at(0) — an empty vector previously threw
    // std::out_of_range instead of returning an error code
    if(mats.empty()){
        //DEBUG_LOG("concat error: input vector is empty");
        return 1;
    }
    int elenum = mats.at(0)->elementNum;
    for(auto m : mats){
        if(elenum != m->elementNum){
            //DEBUG_LOG("elenum should be same");
            return 1;
        }
    }
    int ret;
    switch(dim){
        case 0: ret = concat_batch(mats, dstMat);  break;
        case 1: ret = concat_depth(mats, dstMat);  break;
        case 2: ret = concat_height(mats, dstMat); break;
        case 3: ret = concat_width(mats, dstMat);  break;
        default:
            //DEBUG_LOG("dim error");
            return 1;
    }
    return ret;
};
|
|
|
|
// int concat(std::vector<std::shared_ptr<TensorMat>> pmats, int dim){
|
|
// int ret = 0;
|
|
// printf("-------------------------");
|
|
// if(this->data != NULL){
|
|
// DEBUG_LOG("self tensor should be empty");
|
|
// return 1;
|
|
// }
|
|
// std::vector<TensorMat> mats;
|
|
// int elenum = pmats.at(0)->elementNum;
|
|
// for(auto m : pmats){
|
|
// if(elenum != m->elementNum){
|
|
// DEBUG_LOG("elenum should be same");
|
|
// return 1;
|
|
// }
|
|
// mats.push_back(*(m.get()));
|
|
// }
|
|
// printf("go concat\n");
|
|
// if(dim == 0){
|
|
// ret = this->concat_batch(mats);
|
|
// }else if(dim == 1){
|
|
// ret = this->concat_depth(mats);
|
|
// }else if(dim == 2){
|
|
// ret = this->concat_height(mats);
|
|
// }else if(dim == 3){
|
|
// ret = this->concat_width(mats);
|
|
// }else{
|
|
// DEBUG_LOG("dim error");
|
|
// return 1;
|
|
// }
|
|
// printf("concated\n");
|
|
// if(ret){
|
|
// DEBUG_LOG("concat error");
|
|
// return ret;
|
|
// }
|
|
// printf("go return\n");
|
|
// return 0;
|
|
// };
|
|
|
|
|