
Commit

backup
airMeng committed May 16, 2024
1 parent 38f09be commit 4b561bd
Showing 11 changed files with 508 additions and 861 deletions.
726 changes: 292 additions & 434 deletions ggml-sycl.cpp

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions ggml-sycl.h
@@ -36,8 +36,6 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
// TODO: these are temporary
// ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();

// SYCL doesn't support registering host memory, keep here for reference
// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
1 change: 0 additions & 1 deletion ggml-sycl/backend.hpp
@@ -19,6 +19,5 @@
#include "dmmv.hpp"
#include "mmq.hpp"
#include "mmvq.hpp"
#include "pool.hpp"

#endif // GGML_SYCL_BACKEND_HPP
116 changes: 0 additions & 116 deletions ggml-sycl/common.cpp
@@ -20,122 +20,6 @@ int get_current_device_id() {
return dpct::dev_mgr::instance().current_device_id();
}

void log_ggml_var_device(
const char* name,
float* src,
size_t total_elements,
bool src_on_device) {
if (!g_ggml_sycl_debug)
return;
if (!src) {
printf("GGML Tensor:%s skip to save for NULL pointer\n", name);
return;
}
char filename[1024];
sprintf(filename, "%s.txt", name);
printf("GGML Tensor:%s save to %s\n", name, filename);

size_t total_size = total_elements * sizeof(float);
float* local_buf = NULL;
if (src_on_device) {
local_buf = (float*)ggml_sycl_host_malloc(total_size);
ggml_sycl_set_device(g_main_device);
dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
main_stream->memcpy(local_buf, src, total_size).wait();
} else {
local_buf = (float*)src;
}

std::ofstream logfile;
logfile.open(filename);
for (size_t i = 0; i < total_elements; i++) {
logfile << local_buf[i] << " ";
if ((i + 1) % 20 == 0)
logfile << std::endl;
}
logfile << std::endl;
logfile.close();

if (src_on_device)
ggml_sycl_host_free(local_buf);
}

void log_ggml_var_device_fp16(
const char* name,
sycl::half* src,
size_t total_elements,
bool src_on_device) {
if (!g_ggml_sycl_debug)
return;
if (!src) {
printf("GGML Tensor:%s skip to save for NULL pointer\n", name);
return;
}
char filename[1024];
sprintf(filename, "%s.txt", name);
printf("GGML Tensor:%s save to %s\n", name, filename);

size_t total_size = total_elements * sizeof(sycl::half);
sycl::half* local_buf = NULL;
if (src_on_device) {
local_buf = (sycl::half*)ggml_sycl_host_malloc(total_size);
ggml_sycl_set_device(g_main_device);
dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0];
main_stream->memcpy(local_buf, src, total_size).wait();
} else {
local_buf = (sycl::half*)src;
}

std::ofstream logfile;
logfile.open(filename);
for (size_t i = 0; i < total_elements; i++) {
logfile << local_buf[i] << " ";
if ((i + 1) % 20 == 0)
logfile << std::endl;
}
logfile << std::endl;
logfile.close();

if (src_on_device)
ggml_sycl_host_free(local_buf);
}

void print_ggml_tensor(const char* name, struct ggml_tensor* src) {
if (!g_ggml_sycl_debug)
return;
if (!src) {
printf("GGML Tensor:%s skip to save for NULL pointer\n", name);
return;
}

size_t total_elements = ggml_nelements(src);

const bool src_on_device = src->backend == GGML_BACKEND_TYPE_GPU ||
src->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
float* src_data = NULL;
if (src_on_device) {
ggml_tensor_extra_gpu* src_extra = (ggml_tensor_extra_gpu*)src->extra;
src_data = (float*)src_extra->data_device[g_main_device];
} else {
src_data = (float*)src->data;
}

log_ggml_var_device(name, src_data, total_elements, src_on_device);
}

void log_tensor_with_cnt(
const char* name,
struct ggml_tensor* src,
int stop_cnt) {
stop_cnt = 4;
if (log_file_name_idx >= stop_cnt)
return;
char filename[1280];
sprintf(filename, "%s_%07d", name, log_file_name_idx);
log_file_name_idx++;
print_ggml_tensor(filename, src);
}

void* ggml_sycl_host_malloc(size_t size) try {
if (getenv("GGML_SYCL_NO_PINNED") != nullptr) {
return nullptr;
199 changes: 150 additions & 49 deletions ggml-sycl/common.hpp
@@ -78,6 +78,8 @@ static int g_work_group_size = 0;
#define GGML_SYCL_MMV_Y 1
#endif

typedef sycl::queue *queue_ptr;

enum ggml_sycl_backend_gpu_mode {
SYCL_UNSET_GPU_MODE = -1,
SYCL_SINGLE_GPU_MODE = 0,
@@ -182,17 +184,6 @@ static_assert(
#endif // GGML_SYCL_PEER_MAX_BATCH_SIZE

#define MUL_MAT_SRC1_COL_STRIDE 128
#define MAX_STREAMS 8
#define SYCL_MAX_DEVICES 48

static dpct::queue_ptr g_syclStreams[SYCL_MAX_DEVICES][MAX_STREAMS] = {{0}};

struct ggml_tensor_extra_gpu {
void* data_device[SYCL_MAX_DEVICES]; // 1 pointer for each device for split
// tensors
dpct::event_ptr events[SYCL_MAX_DEVICES]
[MAX_STREAMS]; // events for synchronizing multiple GPUs
};

class sycl_gpu_mgr {
public:
@@ -320,7 +311,7 @@ class sycl_gpu_mgr {
}
};

static sycl_gpu_mgr* g_sycl_gpu_mgr = NULL;
static sycl_gpu_mgr* g_sycl_gpu_mgr = new sycl_gpu_mgr(0);
static int g_device_count = -1;
static int g_all_sycl_device_count = -1;
static int g_main_device = -1;
@@ -329,31 +320,15 @@ static bool g_ggml_backend_sycl_buffer_type_initialized = false;

static std::array<float, SYCL_MAX_DEVICES> g_default_tensor_split = {};

static float g_tensor_split[SYCL_MAX_DEVICES] = {0};
static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0};

static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode =
SYCL_UNSET_GPU_MODE;

struct sycl_device_capabilities {
int cc; // compute capability
bool vmm; // virtual memory support
size_t vmm_granularity; // granularity of virtual memory
int device_id;
};

static sycl_device_capabilities g_device_caps[SYCL_MAX_DEVICES] = {
{0, false, 0, -1}};

struct sycl_device_id2index {
int index;
};

static void* g_scratch_buffer = nullptr;
static size_t g_scratch_size = 0; // disabled by default
static size_t g_scratch_offset = 0;

static dpct::queue_ptr g_sycl_handles[SYCL_MAX_DEVICES] = {nullptr};

int get_main_device();

[[noreturn]] static inline void bad_arch(const sycl::stream& stream_ct1) {
@@ -427,25 +402,151 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try {
std::exit(1);
}

void log_ggml_var_device(
const char* name,
float* src,
size_t total_elements,
bool src_on_device);

void log_ggml_var_device_fp16(
const char* name,
sycl::half* src,
size_t total_elements,
bool src_on_device);

// todo: debug for crash in some case
void print_ggml_tensor(const char* name, struct ggml_tensor* src);

static int log_file_name_idx = 0;
void log_tensor_with_cnt(
const char* name,
struct ggml_tensor* src,
int stop_cnt);
//////////////////////

struct ggml_sycl_device_info {
int device_count;

struct sycl_device_info {
int cc; // compute capability
// int nsm; // number of streaming multiprocessors
// size_t smpb; // max. shared memory per block
bool vmm; // virtual memory support
size_t total_vram;
};

sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};

std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
};

const ggml_sycl_device_info & ggml_sycl_info();
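
ggml_sycl_info() is only declared here; its definition lands in ggml-sycl.cpp, whose diff is not rendered above. Below is a minimal sketch of the likely shape of that accessor, assuming it mirrors the CUDA backend's ggml_cuda_info() (devices enumerated once through the dpct helpers and cached in a function-local static); the helper name ggml_sycl_init and the exact field initialization are assumptions:

    // Sketch only: enumerate devices once and fill the info table.
    static ggml_sycl_device_info ggml_sycl_init() {
        ggml_sycl_device_info info = {};
        info.device_count = dpct::dev_mgr::instance().device_count();

        size_t total_vram = 0;
        for (int i = 0; i < info.device_count; ++i) {
            dpct::device_info prop;
            dpct::get_device_info(prop, dpct::dev_mgr::instance().get_device(i));
            info.devices[i].total_vram   = prop.get_global_mem_size();
            info.default_tensor_split[i] = total_vram;   // cumulative VRAM, normalized below
            total_vram                  += prop.get_global_mem_size();
        }
        for (int i = 0; i < info.device_count; ++i) {
            info.default_tensor_split[i] /= total_vram;
        }
        return info;
    }

    const ggml_sycl_device_info & ggml_sycl_info() {
        static ggml_sycl_device_info info = ggml_sycl_init();  // initialized once, thread-safe
        return info;
    }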

struct ggml_sycl_pool {
virtual ~ggml_sycl_pool() = default;

virtual void * alloc(size_t size, size_t * actual_size) = 0;
virtual void free(void * ptr, size_t size) = 0;
};

template<typename T>
struct ggml_sycl_pool_alloc {
ggml_sycl_pool * pool = nullptr;
T * ptr = nullptr;
size_t actual_size = 0;

explicit ggml_sycl_pool_alloc(ggml_sycl_pool & pool) : pool(&pool) {
}

ggml_sycl_pool_alloc(ggml_sycl_pool & pool, size_t size) : pool(&pool) {
alloc(size);
}

~ggml_sycl_pool_alloc() {
if (ptr != nullptr) {
pool->free(ptr, actual_size);
}
}

// size is in number of elements
T * alloc(size_t size) {
GGML_ASSERT(pool != nullptr);
GGML_ASSERT(ptr == nullptr);
ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
return ptr;
}

T * alloc(ggml_sycl_pool & pool, size_t size) {
this->pool = &pool;
return alloc(size);
}

T * get() {
return ptr;
}

ggml_sycl_pool_alloc() = default;
ggml_sycl_pool_alloc(const ggml_sycl_pool_alloc &) = delete;
ggml_sycl_pool_alloc(ggml_sycl_pool_alloc &&) = delete;
ggml_sycl_pool_alloc& operator=(const ggml_sycl_pool_alloc &) = delete;
ggml_sycl_pool_alloc& operator=(ggml_sycl_pool_alloc &&) = delete;
};
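
ggml_sycl_pool is the abstract device-memory allocator and ggml_sycl_pool_alloc is its RAII guard: alloc() takes a size in elements, converts it to bytes, and the destructor hands the buffer back to the pool. A short usage sketch follows; the function name and sizes are hypothetical, and the pool comes from the ggml_backend_sycl_context defined below:

    // Hypothetical helper showing both allocation styles; the buffers are
    // returned to the pool automatically when the objects go out of scope.
    static void example_tmp_buffers(ggml_backend_sycl_context & ctx, size_t ne) {
        ggml_sycl_pool_alloc<sycl::half> src1_f16(ctx.pool()); // deferred allocation
        src1_f16.alloc(ne);                                    // ne elements, not bytes

        ggml_sycl_pool_alloc<float> dst_tmp(ctx.pool(), ne);   // allocate immediately

        // ... enqueue kernels that read src1_f16.get() and write dst_tmp.get() ...
    } // dst_tmp and src1_f16 are released here, even on early return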

// backend interface

struct ggml_tensor_extra_gpu {
void* data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split
// tensors
dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
[GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
};

struct ggml_backend_sycl_context {
int device;
std::string name;

queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
    queue_ptr sycl_handles[GGML_SYCL_MAX_DEVICES] = {nullptr}; // BLAS-style "handles": aliases of each device's primary queue

explicit ggml_backend_sycl_context(int device) :
device(device),
name(GGML_SYCL_NAME + std::to_string(device)) {
}

    ~ggml_backend_sycl_context() {
        for (int i = 0; i < GGML_SYCL_MAX_DEVICES; ++i) {
            for (int j = 0; j < GGML_SYCL_MAX_STREAMS; ++j) {
                if (qptrs[i][j] != nullptr) {
                    // queues were created through dpct, so hand them back to the device manager
                    dpct::dev_mgr::instance().get_device(i).destroy_queue(qptrs[i][j]);
                }
            }
            // handles only alias qptrs[i][0]; there is nothing separate to release
            sycl_handles[i] = nullptr;
        }
    }

    queue_ptr stream(int device, int stream) {
        if (qptrs[device][stream] == nullptr) {
            SYCL_CHECK(CHECK_TRY_ERROR(qptrs[device][stream] =
                dpct::get_current_device().create_queue(
                    g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
        }
        return qptrs[device][stream];
    }

    queue_ptr stream() {
        return stream(device, 0);
    }

    queue_ptr sycl_handle(int device) {
        if (sycl_handles[device] == nullptr) {
            const dpct::queue_ptr stream = qptrs[device][0];
            // create sycl handle: in SYCL the "handle" is just the device's primary queue
            SYCL_CHECK(CHECK_TRY_ERROR(sycl_handles[device] = stream));
        }
        return sycl_handles[device];
    }

    queue_ptr sycl_handle() {
        return sycl_handle(device);
    }

// pool
std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];

static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);

ggml_sycl_pool & pool(int device) {
if (pools[device] == nullptr) {
pools[device] = new_pool_for_device(qptrs[device][0], device);
}
return *pools[device];
}

ggml_sycl_pool & pool() {
return pool(device);
}
};
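
ggml_backend_sycl_context gathers into one per-backend object the per-device queues, BLAS-style handles and memory pools that previously lived in file-scope globals (e.g. g_syclStreams, removed above); streams and pools are created lazily on first use. A brief usage sketch with a hypothetical function name:

    // Hypothetical op body: pull the primary queue and a pooled scratch buffer
    // from the context instead of from globals.
    static void example_op(ggml_backend_sycl_context & ctx, size_t n) {
        queue_ptr q = ctx.stream();                             // stream 0 of ctx.device, created on demand
        ggml_sycl_pool_alloc<float> scratch(ctx.pool(), n);     // per-device pool, also created on demand
        q->memset(scratch.get(), 0, n * sizeof(float)).wait();  // sycl::queue::memset returns an event
    }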


#endif // GGML_SYCL_COMMON_HPP
2 changes: 1 addition & 1 deletion ggml-sycl/convert.cpp
@@ -1,6 +1,6 @@
#include "convert.hpp"
#include "dequantize.hpp"

#include "presets.hpp"

template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k,
(Diffs for the remaining changed files were not loaded.)
