/*****************************************************************************
 *
 *    Copyright (c) 2016-2026 by Sophgo Technologies Inc. All rights reserved.
 *
 *    Defines class Bmruntime with all runtime functions used locally; it is
 *    not exported to users.
 *
 *****************************************************************************/

#ifndef BMRUNTIME_H_
#define BMRUNTIME_H_

#include <mutex>
#include <thread>
#include <condition_variable>
#include <unordered_map>
#include "bmfunc/bmfunc.h"
//#include "bmcpu.h"
#include "bmruntime_common.h"
#include "bmruntime_profile.h"

#include "bmodel.hpp"
#include "bmlib_runtime.h"
#include <atomic>

using bmodel::CoeffMem;
using bmodel::ModelCtx;
using bmodel::NetParameter;
using flatbuffers::Offset;
using flatbuffers::Vector;

#ifdef _WIN32
#define DECL_EXPORT _declspec(dllexport)
#define DECL_IMPORT _declspec(dllimport)
#else
#define DECL_EXPORT
#define DECL_IMPORT
#endif

#ifndef __linux__
int bmrt_clock_gettime(int dummy, struct timespec* ct);
#endif

namespace bmruntime {

// classes defined in this file.
class Bmruntime;
class BmCoeff;
class KernelModule;

struct BmMemory {
  string desc;                        // description
  bm_device_mem_t device_mem;
  u8 check_code[bmodel::SHA256_LEN];  // sha256
  u64 addr;
  u32 bytes;
  u32 dword_len;
  bm_handle_t bm_handle;
  bool do_check;

  void Init(const string &desc, bm_handle_t handle, const bm_device_mem_t &mem,
            void *buffer, bool do_check = false);
  int Check();
};
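
// Illustrative sketch (not part of the original header): how a BmMemory
// record is typically initialized and then verified. `handle`, `cmd_mem` and
// `host_buf` are assumed to exist and hold a valid bmlib handle, a device
// allocation, and the host-side copy of its contents.
//
//   BmMemory gdma_mem;
//   gdma_mem.Init("gdma cmd", handle, cmd_mem, host_buf, /*do_check=*/true);
//   if (gdma_mem.Check() != 0) {
//     // handle a possible checksum mismatch against check_code
//   }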

struct single_core_command_t {
  vector<int> gdma_id;        // for static
  vector<int> bdc_id;         // for static
  vector<u32> gdma_cmd_byte;  // for static
  vector<u32> bdc_cmd_byte;   // for static

  BmMemory gdma_mem;          // for static
  BmMemory bdc_mem;           // for static
  BmMemory hau_mem;           // for static
  BmMemory sdma_mem;          // for static
  u64 gdma_offset;            // for static subnet
  u64 bdc_offset;             // for static subnet

  BmMemory ir_mem;            // for dynamic
  u32 ir_offset;              // for dynamic subnet
  u32 ir_len;                 // for dynamic subnet
};

typedef struct subnet_tpu_info {
  subnet_tpu_info()
  {
    core_commands.clear();
  }
  int is_dynamic;
  std::vector<single_core_command_t> core_commands;
} SUBNET_TPU_INFO_T;

/* TODO: reuse cpu_layer_param_t */
typedef struct subnet_cpu_info {
  subnet_cpu_info()
  {
    op_type = -1;
    user_param = NULL;
    param_size = 0;
  }
  int op_type;
  void* user_param;
  int param_size;
} SUBNET_CPU_INFO_T;

typedef struct {
  /* for merge subnet, output[i] is selected from {input[output_from[i][0]], ..., input[output_from[i][N]]} */
  vector<vector<int>> output_from;
} SUBNET_MERGE_INFO_T;

typedef struct {
  /* for switch subnet, output[i] comes from input[output_from[i]] */
  vector<int> output_from;
  /* for switch subnet, output_branch[i]==0 means the i-th output goes to the false branch, otherwise to the true branch */
  vector<int> output_branch;
  bool valid;
} SUBNET_SWITCH_INFO_T;

#define SUBNET_MODE_TPU 0
#define SUBNET_MODE_CPU 1
#define SUBNET_MODE_MERGE 2
#define SUBNET_MODE_SWITCH 3

typedef struct {
  int subnet_mode; /* 0: tpu, 1: cpu, 2: merge, 3: switch */
  /* a union cannot be used here because the members contain growable vectors */
  SUBNET_TPU_INFO_T tpu_info;
  SUBNET_CPU_INFO_T cpu_info;
  SUBNET_MERGE_INFO_T merge_info;
  SUBNET_SWITCH_INFO_T switch_info;

  /* per-subnet i/o tensors */
  vector<string> input_tensor_name_v;
  vector<string> output_tensor_name_v;

  int id;
  vector<int> next_subnet_ids;
} SUBNET_INFO_T;
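
// Illustrative sketch (not part of the original header): subnet_mode selects
// which *_info member is meaningful, so a launcher typically dispatches on it.
// The run_*_subnet helpers below are hypothetical placeholders.
//
//   switch (subnet->subnet_mode) {
//     case SUBNET_MODE_TPU:    run_tpu_subnet(subnet->tpu_info);       break;
//     case SUBNET_MODE_CPU:    run_cpu_subnet(subnet->cpu_info);       break;
//     case SUBNET_MODE_MERGE:  run_merge_subnet(subnet->merge_info);   break;
//     case SUBNET_MODE_SWITCH: run_switch_subnet(subnet->switch_info); break;
//   }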

typedef struct {
  bm_shape_t shape;
  bm_store_mode_t st_mode;
  bm_device_mem_t dev_mem;
  u32 pad_h;
} tensor_attr_t;

typedef enum {
  HOST_MEM_INVALID = 0,
  HOST_MEM_ALLOC = 1,     /* allocated internally, needs free */
  HOST_MEM_MMAP = 2,      /* mmapped from tensor dev_mem, needs unmap */
  //HOST_MEM_OUTSIDE = 3, /* memory from outside, do nothing ? */
} host_mem_type_t;

typedef struct {
  void* addr;
  u32 tensor_cpu_addr;
  u64 size;
  host_mem_type_t type;
} host_mem_t;

typedef enum {
  MEM_TYPE_INVALID = 0,
  MEM_TYPE_TPU = (1 << 0),
  MEM_TYPE_CPU = (1 << 1),
} mem_type_t;

typedef enum {
  TENSOR_TYPE_INVALID = 0,
  TENSOR_TYPE_NET_INPUT = (1 << 0),
  TENSOR_TYPE_NET_OUTPUT = (1 << 1),
  TENSOR_TYPE_IMM_IO = (1 << 2),
} tensor_io_type_t;

/* record host mem in addition to device mem for
 * cpu layer tensors.
 */
typedef struct {
  bm_tensor_t tensor_info;
  bm_shape_t max_shape;
  host_mem_t host_mem;
  int mem_type;
  tensor_io_type_t io_type;
  int io_index;              /* index for net input/output */
  SUBNET_INFO_T* src_subnet; /* src subnet for imm i/o tensor */
  int record_elem_num;       /* if 0, ignore it and compute the real elem num from the shape; otherwise use it */
  unsigned int pad_h;        /* pad_h for conv 3ic */
} tensor_ext_t;

struct net_stage_t {
  vector<tensor_attr_t> input_v;
  vector<tensor_attr_t> output_v;
  u64 coeff_offset;
  u64 ctx_start;
  vector<u64> ctx_offset;
  vector<u64> ctx_borders;
  std::vector<bm_device_mem_u64_t> neuron_mem;
  std::vector<u64> neuron_size;

  std::vector<single_core_command_t> core_commands;

  // io alone
  u64 io_start;
  u64 io_size;
  u64 io_offset;
  bm_device_mem_t io_mem;

  // may have multiple subnets
  int subnet_num;                  /* subnet num per net */
  vector<SUBNET_INFO_T*> subnet_v; /* subnets per net */

  /* subnet i/o tensors in addition to net i/o tensors */
  map<string, tensor_ext_t> subnet_tensor_v;
  // saved profile info
  vector<u8> net_profile;
  // saved net stat info
  vector<u8> net_stat;
  // for cpu layer
  u32 cpu_mem_size;
  float* cpu_addr;
};

/* Records neuron usage info after dynamic allocation,
 * detailed per stage and per core permutation.
 */
struct dyn_neuron_stage_t {
  vector<tensor_attr_t> input_v;
  vector<tensor_attr_t> output_v;
  vector<u64> ctx_offset;
  std::vector<bm_device_mem_u64_t> neuron_mem;

  map<string, tensor_ext_t> subnet_tensor_v;
  float* cpu_addr;
};

struct net_ctx_t {
  string net_name;
  vector<string> input_name_v;
  vector<bm_data_type_t> input_type_v;
  vector<float> input_scale_v;
  vector<int> input_zero_point_v;
  vector<string> output_name_v;
  vector<bm_data_type_t> output_type_v;
  vector<float> output_scale_v;
  vector<int> output_zero_point_v;
  vector<net_stage_t *> stage_v; // each net has multiple stages
  std::unordered_map<size_t, dyn_neuron_stage_t *> dyn_neuron_stage_dict; // {neuron_code: dyn_neuron_stage_info}

  // Bulk neuron memories.
  vector<bm_device_mem_u64_t> neuron_mem;

  std::mutex neuron_mutex; // prevents the neuron mem from being used by another thread
  bool is_dynamic = 0;
  int core_num = 1;
  int n_can_change = 0;   // for dynamic
  int h_w_can_change = 0; // for dynamic
  vector<bm_device_mem_t> middlebuff_input;  // for dynamic, one net shares one middle buffer
  vector<bm_device_mem_t> middlebuff_output; // for dynamic
  bm_net_info_t net_info; // created for users by the C interface
  std::shared_ptr<KernelModule> kernel_module_;

  // net with cascade
  int32_t device_id = 0;
  int32_t step_id = 0;
  bool in_cascade = false;
  int32_t addr_mode = 0;
  vector<int> input_from; // which device each input is loaded from
  vector<int> input_hidden_v;
  vector<int> input_index_v;
  vector<int> output_hidden_v;
  vector<int> output_index_v;
  int32_t do_allreduce = 0;
  tpu_kernel_allreduce_1684x_t allreduce_param;
};

// net with cascade
struct mem_cascade_t {
  string name;
  int32_t device_id;
  bm_tensor_t tensor;
};

struct net_cascade_t {
  string main_name;             // net name
  int num_device;               // number of devices used
  vector<vector<int>> step_ids; // each step's nets
  std::vector<mem_cascade_t> hidden_inputs;
  std::vector<mem_cascade_t> hidden_outputs;
  std::vector<int> hidden_inputs_step_ids;
  std::vector<int> hidden_outputs_step_ids;

  vector<string> input_names;
  vector<bm_data_type_t> input_types;
  vector<float> input_scales;
  vector<int> input_zps;
  vector<bm_shape_t> input_shapes;
  vector<size_t> input_bytes;
  vector<bm_device_mem_t> input_mems;
  vector<int> input_loc_devices;

  vector<string> output_names;
  vector<bm_data_type_t> output_types;
  vector<float> output_scales;
  vector<int> output_zps;
  vector<bm_shape_t> output_shapes;
  vector<size_t> output_bytes;
  vector<bm_device_mem_t> output_mems;
  vector<int> output_loc_devices;
  int32_t addr_mode;
  bool is_dynamic;

  bm_net_info_t net_info;
};

class CascadeThread;

class Bmruntime {
public:
  Bmruntime(bm_handle_t* bm_handle, bool user_initlized, const string& arch_name);
  Bmruntime(const string& arch_name, int devid);
  Bmruntime(bm_handle_t* bm_handles, int num_handles,
            bool using_internal_hiddens, const string& arch_name);
  ~Bmruntime();

  friend class BMProfile;

  void set_debug_mode(int mode);
  void set_bmrt_mmap(bool enable);
  void subnet_time_print(bool enable);
  bool load_context(const string& ctx_dir);
  bool load_bmodel(const string& filepath);
  bool load_bmodel(const void* bmodel_data, size_t size);

  /* C++ style Interface */
  const vector<string>* get_input_tensor(int net_idx) const;
  const vector<string>* get_output_tensor(int net_idx) const;
  const vector<u8>* get_net_profile(int net_idx, int stage_idx);
  void init_output_tensors(net_ctx_t* net_ctx, net_stage_t* stage,
                           bm_tensor_t* output_tensors, bool user_mem, bool user_stmode);

  /* C style Interface */
  bool get_input_tensor(int net_idx, int* input_num, const char*** input_names) const;
  bool get_output_tensor(int net_idx, int* output_num, const char*** output_names) const;

  /* use full shape info */
  bool launch(int net_idx, const int input_num, const bm_device_mem_t* input_mems,
              int* input_shapes, int* input_dims, int* in_stmode, int output_num,
              const bm_device_mem_t* output_mems, int* out_stmode, bm_shape_t* output_shapes = NULL);
  bool launch(int net_idx, const bm_tensor_t *input_tensors, int input_num,
              bm_tensor_t *output_tensors, int output_num, bool user_mem = false,
              bool user_stmode = false);
  bool launch_multi_cores(int net_idx, const bm_tensor_t *input_tensors,
                          int input_num, bm_tensor_t *output_tensors,
                          int output_num, const std::vector<int> &core_list,
                          bool user_mem, bool user_stmode);
  bool launch_multi_cores(int net_idx, void* const input_datas[], const bm_shape_t input_shapes[],
                          int input_num, void* output_tensors[], bm_shape_t output_shapes[], int output_num,
                          bool user_mem = false, const std::vector<int>& core_list = {});
  bool launch(const net_cascade_t* net_c, const bm_tensor_t* input_tensors, int input_num,
              bm_tensor_t* output_tensors, int output_num);
  void pre_alloc_neuron_multi_cores(int net_idx, int stage_idx, const std::vector<int> &core_list);
  bool memcpy_s2d_parallel(bm_tensor_t tensors[], void* datas[],
                           int tensor_num[], int device_num);
  bool memcpy_d2s_parallel(void* datas[], bm_tensor_t tensors[],
                           int tensor_num[], int device_num);
  bool memcpy_d2d_byte_parallel(bm_tensor_t dst_tensors[], size_t dst_offsets[],
                                bm_tensor_t src_tensors[], size_t src_offsets[],
                                size_t sizes[], int tensor_num[], int device_num);
  bool memcpy_d2d_stride_ex_parallel(bm_tensor_t dst_tensors[],
                                     size_t dst_offsets[],
                                     bm_shape_t dst_strides[],
                                     bm_tensor_t src_tensors[],
                                     size_t src_offsets[],
                                     bm_shape_t src_strides[],
                                     bm_shape_t shapes[],
                                     int tensor_num[],
                                     int device_num);

  const bm_shape_t* get_input_max_shape(int net_idx, int input_idx);
  const bm_shape_t* get_output_max_shape(int net_idx, int output_idx);
  int get_input_blob_max_shape(const string& tensor_name, int net_idx, int* shape);
  int get_output_blob_max_shape(const string& tensor_name, int net_idx, int* shape);

  // get input and output index by name
  int get_input_index(const string& tensor_name, int net_idx);
  int get_output_index(const string& tensor_name, int net_idx);

  // data_type 0: FP32, 1: FP16, 2: INT8, 3: UINT8, 4: INT16, 5: UINT16
  int get_input_data_type(const string& tensor_name, int net_idx);
  int get_output_data_type(const string& tensor_name, int net_idx);

  // store mode 0: 1N, 1: 2N, 2: 4N
  int get_input_gmem_stmode(const string& tensor_name, int net_idx);
  int get_output_gmem_stmode(const string& tensor_name, int net_idx);

  /* COMMON */
  bool can_batch_size_change(int net_idx);
  bool can_height_and_width_change(int net_idx);

  /* simple get/show */
  void get_network_names(vector<const char*>* names);

  void show_neuron_network();

  /* flag get/set */
  inline uint32_t get_flags() {
    return m_flags;
  }

  inline void set_flags(uint32_t flags) {
    m_flags = flags;
  }

  inline int get_network_number()
  {
    auto num_cascade = m_net_cascade_v.size();
    auto num_net = m_net_ctx_v.size();
    if (num_cascade != 0) {
      for (auto v : m_net_ctx_v) {
        if (v->in_cascade) {
          num_net--;
        }
      }
    }
    return num_cascade + num_net;
  }

  inline bm_handle_t get_bm_handle(int device_idx = 0)
  {
    return m_handles[device_idx];
  }
  inline int get_devid(int device_idx = 0) {
    return m_devids[device_idx];
  }
  const net_cascade_t* get_net_cascade(const string& net_name);
  bool cascade_thread_step(int net_idx,
                           std::vector<mem_cascade_t> *src,
                           std::vector<mem_cascade_t> *dst,
                           bm_handle_t m_handle);
  bool cascade_thread_global_move_data(
      int devid, bm_handle_t handle,
      std::vector<tpu_kernel_global_move_1684x_t> *param);
  int get_net_idx(const string& net_name);
  const bm_net_info_t* get_net_info(int net_idx);
  const bm_net_info_t* get_net_info(const string& net_name);

  const vector<bm_device_mem_u64_t>& get_neuron_mem(int net_idx);
  void trace();

  size_t size_4N_align(const bm_shape_t& shape, const bm_data_type_t& type);

  u64 must_alloc_device_mem(uint32_t devid, bm_device_mem_t* mem, u64 size, const string& desc = "", int type_len = 1);
  bm_device_mem_t must_alloc_device_mem(uint32_t devid, u64 size, const string& desc = "", int type_len = 1);
  void must_free_device_mem(uint32_t devid, bm_device_mem_t& mem);

  // sg alloc for over 4GB
  u64 must_alloc_device_mem_u64(uint32_t devid, bm_device_mem_u64_t* mem, u64 size, const string& desc = "", int type_len = 1);
  bm_device_mem_u64_t must_alloc_device_mem_u64(uint32_t devid, u64 size, const string& desc = "", int type_len = 1);
  void must_free_device_mem_u64(uint32_t devid, bm_device_mem_u64_t& mem);

protected:
  void init();
  void init_bmfunc(const string& arch_name);
  void sync_cores(bm_handle_t handle, const std::vector<int32_t>& core_list);
  bool launch_static(net_ctx_t* net_ctx, net_stage_t* stage, const bm_tensor_t* input_tensors,
                     int input_num, bm_tensor_t* output_tensors, int output_num,
                     const std::vector<int32_t> &core_list, const size_t dyn_core_mask);
  bool launch_ir(net_ctx_t* net_ctx, net_stage_t* stage, const bm_tensor_t* input_tensors,
                 int input_num, bm_tensor_t* output_tensors, int output_num,
                 const size_t dyn_core_mask);

  int get_stage_idx(const net_ctx_t* net_ctx, const bm_tensor_t* input_tensors);
  int get_static_stage_idx(const net_ctx_t* net_ctx, const bm_tensor_t* input_tensors);
  int get_dynamic_stage_idx(const net_ctx_t* net_ctx, const bm_tensor_t* input_tensors);
  std::vector<int32_t> refine_core_list(const net_stage_t *stage,
                                        const std::vector<int32_t> &core_list,
                                        bm_handle_t handle);

protected:
  // functions for loading a bmodel
  u64 fix_gdma_addr(const net_stage_t* stage, u64 origin_addr, bool is_src);
  void convert_cmd(u32* cmd, int engine_id, bool last_cmd, u64 start_address,
                   const net_stage_t* stage);
  bool setup_cmd_context(ModelCtx* model_ctx, const bmodel::NetParameter *param,
                         net_stage_t* stage, uint32_t device_id);
  bool setup_ir_context(ModelCtx* model_ctx, const bmodel::Binary* binary_ir,
                        const Vector<Offset<bmodel::StageIR>>* stage_ir, net_stage_t* stage, uint32_t device_id);
  bool load_bmodel(ModelCtx*);
  bool load_bmodel_net(ModelCtx*, int net_idx);
  bool load_bmodel_net(ModelCtx*, int net_idx, net_ctx_t* net_ctx);
  void load_tpu_module(ModelCtx*);
  void load_cpu_module(ModelCtx*);
  bool fill_net_ctx(
      ModelCtx* model_ctx,
      net_ctx_t* net_ctx, const Vector<Offset<NetParameter>>* params,
      vector<vector<u64>> &stage_ctx_sizes, net_stage_t *stages);
  void fill_subnet_dyn_neuron_tensor(
      net_ctx_t* net_ctx, const size_t dyn_core_mask,
      const net_stage_t *common_stage_info);
  void net_ctx_alloc_dyn_neuron(net_ctx_t* net_ctx, const size_t dyn_core_mask,
                                const net_stage_t *common_stage_info, bool use_multi_subnet);
  void fill_net_info(net_ctx_t* net_ctx);
  void free_net_info(net_ctx_t* net_ctx);
  void free_dyn_neuron(net_ctx_t* net_ctx);
  void update_net_middlebuf(net_ctx_t *net_ctx);
  void update_max_middlebuf_size(net_ctx_t* net_ctx);
  void update_max_neuron_mem(uint32_t devid, const vector<u64> &sizes);
  bool setup_profile_context(ModelCtx* model_ctx, net_stage_t* net_stage,
                             const bmodel::Binary* net_profile,
                             const bmodel::Binary* net_stat);

  void set_profile_enabled(bool enable);

  // functions for filling static bmdnn net info
  void fill_tpu_net_info(net_ctx_t *net_ctx, net_stage_t *stage,
                         const bm_tensor_t *input_tensors, int input_num,
                         bm_tensor_t *output_tensors, int output_num,
                         const std::vector<int32_t> &core_list,
                         tpu_net_info_t &net_info,
                         const size_t dyn_core_mask);
  template <typename T_stage>
  void fill_tpu_tensor_info(vector<tpu_tensor_info_t> &tensor_info,
                            const T_stage *stage,
                            const bm_tensor_t *user_tensors, bool is_input);
  void fill_tpu_cmd_info(std::vector<tpu_cmd_info_t> &cmd_info,
                         const net_stage_t *stage, const int32_t core_idx);
  // functions for filling tpu static subnet net info
  template <typename T_stage>
  void fill_tpu_tensor_info(vector<tpu_tensor_info_t> &tensor_info,
                            const T_stage *stage,
                            const SUBNET_INFO_T *subnet,
                            const bm_tensor_t *user_tensors, bool is_input);
  void fill_tpu_cmd_info(std::vector<tpu_cmd_info_t> &cmd_info,
                         const SUBNET_INFO_T *subnet,
                         const int32_t core_idx);
  // functions for cascade
  void cascade_fill_net_info(net_cascade_t *net_cascade);
  void cascade_free_net_info(net_cascade_t *net_cascade);
  bool cascade_insert_net(int net_idx, net_ctx_t *net_ctx,
                          const string &main_name);
  void cascade_update_all_info();
  void cascade_update_input(net_cascade_t &v);
  void cascade_update_output(net_cascade_t &v);
  void cascade_update_max_hidden_buffer_size(net_cascade_t &v);
  void cascade_update_hidden_buffer(net_cascade_t &v);
  bm_tensor_t *
  cascade_prepare_input(const string &name,
                        int32_t devid,
                        std::vector<mem_cascade_t> *src,
                        std::vector<mem_cascade_t> *dst);
  bm_tensor_t *
  cascade_prepare_output(const string &name, uint32_t devid,
                         std::vector<mem_cascade_t> *dst);

  bool cascade_update_output_shape(net_ctx_t *net_ctx,
                                   std::vector<mem_cascade_t> *dst,
                                   std::vector<bm_tensor_t> out_tensors);
  uint32_t get_dyn_core_mask(int stage_idx, const std::vector<int32_t> core_list);
  std::vector<int> get_core_list_from_core_mask(uint32_t dyn_core_mask);
public:
  api_info_t get_api_info(int net_idx, const bm_tensor_t *input_tensors,
                          int input_num, bm_tensor_t *output_tensors,
                          int output_num, bool user_mem, bool user_stmode,
                          uint32_t *core_ids);

protected: // one bmruntime can load nets at most
  vector<net_ctx_t*> m_net_ctx_v;
  vector<net_cascade_t> m_net_cascade_v; // info of nets in cascade
  vector<std::shared_ptr<CascadeThread>> m_cascade_thread_v; // threads for cascade

  static const int MAX_DEVICE_NUM = 32; // one bmruntime can run 32 devices at most
  bm_handle_t m_handles[MAX_DEVICE_NUM];
  int m_device_num;
  unsigned int m_core_num;
  bool using_internal_hidden_tensors; /* hidden_tensors device_mem is initialized internally, or accepted from a user parameter at launch */
  bool using_internal_bm_handle;      /* bm_handle is initialized internally, or accepted from a user parameter */
  int m_devids[MAX_DEVICE_NUM];
  bool using_fast_allreduce;

  vector<bm_device_mem_t> m_device_mem_vec; /* saved device memory addresses, for freeing */
  vector<uint32_t> m_device_mem_ids;        /* records which device each device memory belongs to */

  vector<bm_device_mem_u64_t> m_sg_device_mem_vec; /* saved device memory addresses, for freeing */
  vector<uint32_t> m_sg_device_mem_ids;            /* records which device each device memory belongs to */

  std::shared_ptr<BmCoeff> m_local_coeffs[MAX_DEVICE_NUM];
  static map<int, std::shared_ptr<BmCoeff>> m_global_coeff_map;
  static std::mutex m_global_coeff_mutex;

  static map<vector<u8>, std::unique_ptr<uint8_t[]>> m_global_cpu_const_map;
  static std::mutex m_global_cpu_const_mutex;

  std::mutex m_load_mutex;

  bool b_enable_mmap;
  bool m_subnet_time_print;
  uint32_t m_flags;

  std::shared_ptr<BMProfile> m_profile;

  // For the middle buffer.
  // Because max_middle_buffer is also recorded in m_device_mem_vec,
  // it does not need to be freed separately at the end.
  bm_device_mem_t max_middle_buffer[MAX_DEVICE_NUM];
  u64 max_middle_buffer_size[MAX_DEVICE_NUM];
  u32 middle_buffer_num[MAX_DEVICE_NUM];

  // For the hidden buffer.
  // Because max_hidden_buffer is also recorded in m_device_mem_vec,
  // it does not need to be freed separately at the end.
  bm_device_mem_t max_hidden_buffer[MAX_DEVICE_NUM];
  u64 max_hidden_buffer_size[MAX_DEVICE_NUM];
  u32 hidden_buffer_num[MAX_DEVICE_NUM];

  // For neuron memory sharing
  u32 m_neuron_heap_mask;
  vector<bm_device_mem_u64_t> max_neuron_mem[MAX_DEVICE_NUM];
  std::shared_ptr<KernelModule> kernel_modules[MAX_DEVICE_NUM];

protected:
  /* functions for subnets */
  void bmcpu_setup();
  void bmtpu_setup();
  bool launch_cpu_subnet(net_ctx_t* net_ctx, map<string, tensor_ext_t> *subnet_tensor_v, const SUBNET_INFO_T* subnet,
                         const bm_tensor_t* input_tensors, bm_shape_t real_out_shape[]);
  bool launch_tpu_subnet(net_ctx_t* net_ctx, net_stage_t* stage, const SUBNET_INFO_T* subnet,
                         const bm_tensor_t* input_tensors, int input_num,
                         bm_tensor_t* output_tensors, int output_num,
                         const uint32_t dyn_core_mask);
  bool launch_tpu_ir_subnet(net_ctx_t* net_ctx, net_stage_t* stage, const SUBNET_INFO_T* subnet,
                            const bm_tensor_t* input_tensors, const int* input_elem_num, int input_num,
                            bm_tensor_t* output_tensors, int* output_elem_num, int output_num,
                            const uint32_t dyn_core_mask);
  bool launch_multi_subnet(net_ctx_t* net_ctx, net_stage_t* stage, const bm_tensor_t* input_tensors,
                           int input_num, bm_tensor_t* output_tensors, int output_num,
                           const uint32_t dyn_core_mask);
  void fill_sub_net(ModelCtx* model_ctx, const Vector<Offset<bmodel::SubNet>>* subnet_set_v,
                    net_ctx_t* net_ctx, net_stage_t* net_stage);
  void fill_subnet_tensor_map(net_ctx_t* net_ctx, net_stage_t* net_stage, SUBNET_INFO_T* subnet,
                              const Vector<Offset<bmodel::Tensor>>* tensor_set_v, bool is_input,
                              std::set<string> subnet_switch_inputs);
  void subnet_clear(net_ctx_t* net_ctx);
  void subnet_tensor_s2d(uint32_t devid, map<string, tensor_ext_t> *subnet_tensor_v, const string& tensor_name,
                         bm_device_mem_t* out_dev_mem = NULL, u64 offset = 0, u64 size = 0);
  void* subnet_tensor_d2s(uint32_t devid, map<string, tensor_ext_t> *subnet_tensor_v, const string& tensor_name,
                          bm_device_mem_t* out_dev_mem = NULL, u64 offset = 0, u64 size = 0);
  void subnet_tensor_forward(uint32_t devid, map<string, tensor_ext_t> *subnet_tensor_v, const string& src_tensor,
                             const string& dst_tensor, const bm_tensor_t* output_tensors);

protected:
  typedef void* (*t_bmcpu_init)();
  typedef void (*t_bmcpu_uninit)(void*);
  typedef void (*t_bmcpu_process)(void*, int, void*, int, const vector<float*>&,
                                  const vector<vector<int>>&, const vector<float*>&,
                                  vector<vector<int>>&);
  void* bmcpu_handle_;
  t_bmcpu_init bmcpu_init_;
  t_bmcpu_uninit bmcpu_uninit_;
  t_bmcpu_process bmcpu_process_;

  void* customcpu_handle_ = NULL;
  t_bmcpu_init customcpu_init_ = NULL;
  t_bmcpu_uninit customcpu_uninit_ = NULL;
  t_bmcpu_process customcpu_process_ = NULL;

  std::shared_ptr<KernelModule> kernel_module_;

private:
  bmfunc* p_bmfunc;

  // temp custom cpu related
  void *tmpcpuso_handle_ = NULL;
  std::string temp_filename_;
  int card_chip_num;
};
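
// Illustrative sketch (not part of the original header): a minimal flow that
// drives this internal class directly. The literal arguments (arch name,
// bmodel path, net index 0) are placeholders, and `in`/`out` are assumed to
// be bm_tensor_t arrays already sized according to get_net_info().
//
//   bm_handle_t handle;
//   bm_dev_request(&handle, 0);
//   bmruntime::Bmruntime rt(&handle, /*user_initlized=*/true, "BM1684X");
//   rt.load_bmodel("example.bmodel");
//   const bm_net_info_t* info = rt.get_net_info(0);
//   rt.launch(0, in, info->input_num, out, info->output_num);
//   bm_thread_sync(handle);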

class BmCoeff {
public:
  explicit BmCoeff(bm_handle_t handle);
  explicit BmCoeff(int devid);
  ~BmCoeff();

  u64 Register(ModelCtx* model_ctx, const CoeffMem* coeff_mem);
  int Check();
  bm_device_mem_u64_t GetCoeffDeviceMem() {
    return m_latest_device_mem;
  }

protected:
  map<vector<u8>, bm_device_mem_u64_t> m_coeff_map; /* to share the same coeff, by check code */
  std::mutex m_coeff_mutex;
  bm_handle_t m_handle;
  bool m_inner_handle;
  int m_devid;
  bm_device_mem_u64_t m_latest_device_mem;
};

class KernelModule {
public:
  explicit KernelModule(bm_handle_t &handle) : m_handle(handle) {}
  ~KernelModule();

private:
  void preload_funcs(int core_id);

public:
  void add_core_module(int core_id, const unsigned char* binary, size_t size);
  void add_core_module(int core_id, const char* filename);
  vector<tpu_kernel_function_t> get_multi_fullnet_func_id(const vector<int>& core_list);
  vector<tpu_kernel_function_t> get_dynamic_fullnet_func_id(const vector<int>& core_list);
  vector<tpu_kernel_function_t> get_enable_profile_func_id(const vector<int>& core_list);
  vector<tpu_kernel_function_t> get_get_profile_func_id(const vector<int>& core_list);
  vector<tpu_kernel_function_t> get_set_engine_profile_param_func_id(const vector<int>& core_list);
  vector<tpu_kernel_function_t> get_global_move_1684x_func_id(const vector<int>& core_list);

private:
  bm_handle_t m_handle;
  map<int, tpu_kernel_module_t> _kernel_modules;
  map<int, tpu_kernel_function_t> _multi_fullnet_func_id;
  map<int, tpu_kernel_function_t> _dynamic_fullnet_func_id;
  map<int, tpu_kernel_function_t> _enable_profile_func_id;
  map<int, tpu_kernel_function_t> _get_profile_func_id;
  map<int, tpu_kernel_function_t> _set_engine_profile_param_func_id;
  map<int, tpu_kernel_function_t> _global_move_1684x_func_id;
};

class CascadeThread {
  typedef enum {
    NET_MODE = 0,
    S2D_MODE = 1,
    D2S_MODE = 2,
    D2D_MODE = 3,
    D2D_STRIDE_EX_MODE = 4,
    UNKNOWN = -1,
  } FUNC_MODE_T;

public:
  CascadeThread(Bmruntime *rt, bm_handle_t handle)
      : m_stop(false), m_paramReady(false), m_done(true),
        m_ok(true), m_handle(handle), m_rt(rt)
  {
    m_worker = std::thread(&CascadeThread::threadFunction, this);
  }

  ~CascadeThread() {
    {
      // std::unique_lock<std::mutex> lock(m_mutex);
      m_done = false;
      m_stop = true;
      // m_condition.notify_all();
      while (m_done == false) { std::this_thread::yield(); }
    }
    if (m_worker.joinable()) {
      m_worker.join();
    }
  }

  void run(int net_idx,
           vector<mem_cascade_t> *src,
           vector<mem_cascade_t> *dst) {
    // std::unique_lock<std::mutex> lock(m_mutex);
    m_net_idx = net_idx;
    m_src = src;
    m_dst = dst;
    m_mode = NET_MODE;
    m_done = false;
    m_paramReady = true;
    // m_condition.notify_all();
  }

  void s2d(bm_tensor_t *tensors, void **datas, int tensor_num) {
    // std::unique_lock<std::mutex> lock(m_mutex);
    m_dst_tensors = tensors;
    m_datas = datas;
    m_mode = S2D_MODE;
    m_tensor_num = tensor_num;
    m_done = false;
    m_paramReady = true;
    // m_condition.notify_all();
  }

  void d2s(void **datas, bm_tensor_t *tensors, int tensor_num) {
    // std::unique_lock<std::mutex> lock(m_mutex);
    m_src_tensors = tensors;
    m_datas = datas;
    m_mode = D2S_MODE;
    m_tensor_num = tensor_num;
    m_done = false;
    m_paramReady = true;
    // m_condition.notify_all();
  }

  void d2d(bm_tensor_t *dst_tensors, size_t *dst_offsets,
           bm_tensor_t *src_tensors, size_t *src_offsets,
           size_t *sizes, int tensor_num) {
    // std::unique_lock<std::mutex> lock(m_mutex);
    m_src_tensors = src_tensors;
    m_dst_tensors = dst_tensors;
    m_src_offsets = src_offsets;
    m_dst_offsets = dst_offsets;
    m_sizes = sizes;
    m_mode = D2D_MODE;
    m_tensor_num = tensor_num;
    m_done = false;
    m_paramReady = true;
    // m_condition.notify_all();
  }

  void d2d_stride_ex(int devid,
                     std::vector<tpu_kernel_global_move_1684x_t> *params) {
    // std::unique_lock<std::mutex> lock(m_mutex);
    m_devid = devid;
    m_global_move_params = params;
    m_mode = D2D_STRIDE_EX_MODE;
    m_done = false;
    m_paramReady = true;
    // m_condition.notify_all();
  }

  bool sync() {
    // std::unique_lock<std::mutex> lock(m_mutex);
    // m_doneCondition.wait(lock, [this]() { return m_done; });
    while (m_done == false) { std::this_thread::yield(); }
    return m_ok;
  }

private:
  void threadFunction() {
    while (true) {
      // std::unique_lock<std::mutex> lock(m_mutex);
      // m_condition.wait(lock, [this]() { return m_paramReady || m_stop; });
      // BMRT_LOG(INFO, "M_MODE is %d\n", m_mode);
      while (m_paramReady == false && m_stop == false) { std::this_thread::yield(); }
      if (m_stop) {
        m_done = true;
        return;
      }
      if (m_mode == NET_MODE) {
        m_ok = m_rt->cascade_thread_step(m_net_idx, m_src, m_dst, m_handle);
        if (m_ok) {
          auto status = bm_thread_sync(m_handle);
          m_ok = BM_SUCCESS == status;
        }
      } else if (m_mode == S2D_MODE) {
        for (int i = 0; i < m_tensor_num; ++i) {
          auto status = bm_memcpy_s2d(m_handle, m_dst_tensors[i].device_mem, m_datas[i]);
          if (BM_SUCCESS != status) {
            m_ok = false;
            break;
          } else {
            m_ok = true;
          }
        }
      } else if (m_mode == D2S_MODE) {
        for (int i = 0; i < m_tensor_num; ++i) {
          auto status = bm_memcpy_d2s(m_handle, m_datas[i], m_src_tensors[i].device_mem);
          if (BM_SUCCESS != status) {
            m_ok = false;
            break;
          } else {
            m_ok = true;
          }
        }
      } else if (m_mode == D2D_MODE) {
        for (int i = 0; i < m_tensor_num; ++i) {
          auto status = bm_memcpy_d2d_byte(m_handle, m_dst_tensors[i].device_mem, m_dst_offsets[i],
                                           m_src_tensors[i].device_mem, m_src_offsets[i], m_sizes[i]);
          if (BM_SUCCESS != status) {
            m_ok = false;
            break;
          } else {
            m_ok = true;
          }
        }
      } else if (m_mode == D2D_STRIDE_EX_MODE) {
        // global move
        m_ok = m_rt->cascade_thread_global_move_data(m_devid, m_handle,
                                                     m_global_move_params);
      }
      m_paramReady = false;
      m_done = true;
      // m_doneCondition.notify_one();
    }
  }

  std::thread m_worker;
  // std::mutex m_mutex;
  // std::condition_variable m_condition;
  // std::condition_variable m_doneCondition;
  std::atomic_bool m_stop;
  std::atomic_bool m_paramReady;
  std::atomic_bool m_done;
  // bool m_stop;
  // bool m_paramReady;
  // bool m_done;
  bool m_ok;
  // s2d/d2s/d2d param
  bm_tensor_t *m_src_tensors;
  bm_tensor_t *m_dst_tensors;
  void **m_datas;
  FUNC_MODE_T m_mode;
  int m_tensor_num;
  size_t *m_src_offsets;
  size_t *m_dst_offsets;
  size_t *m_sizes;
  // net param
  int m_net_idx;
  bm_handle_t m_handle;
  Bmruntime *m_rt;
  vector<mem_cascade_t> *m_src;
  vector<mem_cascade_t> *m_dst;
  // d2d_stride_ex param
  int m_devid;
  std::vector<tpu_kernel_global_move_1684x_t> *m_global_move_params;
};
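
// Illustrative sketch (not part of the original header): each CascadeThread
// call just publishes its parameters and returns, so it must be paired with
// sync(). A typical per-device step, with `threads`, `src`, and `dst` assumed
// to be set up by the caller (one entry per device):
//
//   for (size_t i = 0; i < threads.size(); ++i) {
//     threads[i]->run(net_idx, &src[i], &dst[i]);  // kick off, returns immediately
//   }
//   bool ok = true;
//   for (auto &t : threads) {
//     ok = t->sync() && ok;                        // wait for completion, collect status
//   }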

} // namespace bmruntime

#endif