Skip to content

Commit

Permalink
remove old filever checks (+7 squashed commit)
Browse files Browse the repository at this point in the history
Squashed commit:

[b72627a] new format not working

[e568870] old ver works

[7053b77] compile errors fixed, fixing linkers

[4ae8889] add new ver

[ff82dfd] file format checks

[25b8aa8] refactoring type names

[931063b] still merging
  • Loading branch information
LostRuins committed May 20, 2023
1 parent 417302b commit c048bcf
Show file tree
Hide file tree
Showing 25 changed files with 3,166 additions and 492 deletions.
8 changes: 8 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,14 @@ target_compile_features(ggml_v1 PUBLIC c_std_11) # don't bump
target_link_libraries(ggml_v1 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
set_target_properties(ggml_v1 PROPERTIES POSITION_INDEPENDENT_CODE ON)

add_library(ggml_v2 OBJECT
otherarch/ggml_v2.c
otherarch/ggml_v2.h)
target_include_directories(ggml_v2 PUBLIC . ./otherarch ./otherarch/tools)
target_compile_features(ggml_v2 PUBLIC c_std_11) # don't bump
target_link_libraries(ggml_v2 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
set_target_properties(ggml_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON)

add_library(common2
examples/common.cpp
examples/common.h)
Expand Down
32 changes: 21 additions & 11 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,21 @@ ggml_clblast.o: ggml.c ggml.h
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(CLBLAST_FLAGS) -c $< -o $@
ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
ggml-opencl-legacy.o: ggml-opencl-legacy.c ggml-opencl-legacy.h

#version 2 libs
ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -c $< -o $@
ggml_v2_openblas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(OPENBLAS_FLAGS) -c $< -o $@
ggml_v2_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
$(CC) $(CFLAGS) -c $< -o $@
ggml_v2_openblas_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
$(CC) $(CFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(CLBLAST_FLAGS) -c $< -o $@
ggml_v2-opencl.o: otherarch/ggml_v2-opencl.cpp otherarch/ggml_v2-opencl.h
$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
ggml_v2-opencl-legacy.o: otherarch/ggml_v2-opencl-legacy.c otherarch/ggml_v2-opencl-legacy.h
$(CC) $(CFLAGS) -c $< -o $@

#extreme old version compat
Expand Down Expand Up @@ -264,19 +278,15 @@ main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
@echo '==== Run ./main -h for help. ===='
@echo

koboldcpp: ggml.o ggml_v1.o expose.o common.o gpttype_adapter.o $(OBJS)
koboldcpp: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o $(OBJS)
$(DEFAULT_BUILD)

koboldcpp_openblas: ggml_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o
$(OPENBLAS_BUILD)

koboldcpp_noavx2: ggml_noavx2.o ggml_v1_noavx2.o expose.o common.o gpttype_adapter.o
koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o
$(OPENBLAS_BUILD)
koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_noavx2.o expose.o common.o gpttype_adapter.o
$(NOAVX2_BUILD)

koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o gpttype_adapter.o
koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_v2_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o gpttype_adapter.o
$(OPENBLAS_NOAVX2_BUILD)

koboldcpp_clblast: ggml_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml-opencl-legacy.o
koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o
$(CLBLAST_BUILD)

quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o
Expand Down
4 changes: 2 additions & 2 deletions expose.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ extern "C"
putenv((char*)deviceenv.c_str());
executable_path = inputs.executable_path;

if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4)
if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4 || file_format==FileFormat::GPTJ_5)
{
printf("\n---\nIdentified as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format);
ModelLoadResult lr = gpttype_load_model(inputs, file_format);
Expand Down Expand Up @@ -141,7 +141,7 @@ extern "C"
return true;
}
}
else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5|| file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
{
printf("\n---\nIdentified as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
ModelLoadResult lr = gpttype_load_model(inputs, file_format);
Expand Down
127 changes: 62 additions & 65 deletions ggml-opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#define CL_TARGET_OPENCL_VERSION 110
#include <clblast.h>
#include <clblast_c.h>

#include <stdlib.h>
#include <stdio.h>
Expand All @@ -16,55 +17,45 @@
#define CL_DMMV_BLOCK_SIZE 32;

#define MULTILINE_QUOTE(...) #__VA_ARGS__
std::string program_source = MULTILINE_QUOTE(
static std::string program_source = MULTILINE_QUOTE(

typedef char int8_t;
typedef uchar uint8_t;
typedef int int32_t;
typedef uint uint32_t;

constant uint QK4_0 = 32;
constant uint QR4_0 = 2;
struct block_q4_0
{
half d;
uint8_t qs[QK4_0 / 2];
uint8_t qs[16];
};

constant uint QK4_1 = 32;
constant uint QR4_1 = 2;
struct block_q4_1
{
half d;
half m;
uint8_t qs[QK4_1 / 2];
uint8_t qs[16];
};

constant uint QK5_0 = 32;
constant uint QR5_0 = 2;
struct __attribute__ ((packed)) block_q5_0
{
half d;
uint32_t qh;
uint8_t qs[QK5_0 / 2];
uint8_t qs[16];
};

constant uint QK5_1 = 32;
constant uint QR5_1 = 2;
struct block_q5_1
{
half d;
half m;
uint32_t qh;
uint8_t qs[QK5_1 / 2];
uint8_t qs[16];
};

constant uint QK8_0 = 32;
constant uint QR8_0 = 1;
struct block_q8_0
{
half d;
uint8_t qs[QK8_0];
uint8_t qs[32];
};


Expand Down Expand Up @@ -135,13 +126,13 @@ void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const in
*v0 = vi0*d;
*v1 = vi1*d;
}
void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
static void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
*v0 = vload_half(0, &x[ib + 0]);
*v1 = vload_half(0, &x[ib + 1]);
}
);

std::string dequant_template = MULTILINE_QUOTE(
static std::string dequant_template = MULTILINE_QUOTE(
__kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2;

Expand All @@ -165,7 +156,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
}
);

std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
static std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
const int block_size = get_local_size(0);
const int row = get_global_id(0) / block_size;
Expand Down Expand Up @@ -207,29 +198,29 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
}
);

std::array<std::string, 5> dequant_str_keys = {
static std::array<std::string, 5> dequant_str_keys = {
"KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC"
};

std::array<std::string, 30> dequant_str_values = {
"dequantize_row_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
"dequantize_row_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
"dequantize_row_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
"dequantize_row_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
"dequantize_row_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
static std::array<std::string, 30> dequant_str_values = {
"dequantize_row_q4_0", "struct block_q4_0", "32", "2", "dequantize_q4_0",
"dequantize_row_q4_1", "struct block_q4_1", "32", "2", "dequantize_q4_1",
"dequantize_row_q5_0", "struct block_q5_0", "32", "2", "dequantize_q5_0",
"dequantize_row_q5_1", "struct block_q5_1", "32", "2", "dequantize_q5_1",
"dequantize_row_q8_0", "struct block_q8_0", "32", "1", "dequantize_q8_0",
"convert_row_f16", "half", "1", "1", "convert_f16"
};

std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
"dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
"dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
"dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
"dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
"dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
static std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
"dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "32", "2", "dequantize_q4_0",
"dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "32", "2", "dequantize_q4_1",
"dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "32", "2", "dequantize_q5_0",
"dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "32", "2", "dequantize_q5_1",
"dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "32", "1", "dequantize_q8_0",
"convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
};

std::string& replace(std::string& s, const std::string& from, const std::string& to) {
static std::string& sreplace(std::string& s, const std::string& from, const std::string& to) {
size_t pos = 0;
while ((pos = s.find(from, pos)) != std::string::npos) {
s.replace(pos, from.length(), to);
Expand All @@ -238,15 +229,15 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
return s;
}

std::string generate_kernels() {
static std::string generate_kernels() {
std::stringstream src;
src << program_source << '\n';
for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) {
std::string dequant_kernel = dequant_template;
std::string dmmv_kernel = dequant_mul_mat_vec_template;
for (size_t j = 0; j < dequant_str_keys.size(); j++) {
replace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]);
replace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
sreplace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]);
sreplace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
}
src << dequant_kernel << '\n';
src << dmmv_kernel << '\n';
Expand All @@ -259,6 +250,7 @@ std::string generate_kernels() {
cl_int err_ = (err); \
if (err_ != CL_SUCCESS) { \
fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n"); \
exit(1); \
} \
} while (0)
Expand All @@ -271,7 +263,7 @@ static cl_program program;
static cl_kernel convert_row_f16_cl;
static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
static bool fp16_support;
static bool fp16_support = false;

static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
cl_program p;
Expand Down Expand Up @@ -339,6 +331,8 @@ void ggml_cl_init(void) {
}
free(ext_buffer);
printf("Using Platform: %s Device: %s FP16: %d\n", platform_buffer, device_buffer, fp16_support);
fp16_support = false;
printf("CL FP16 temporarily disabled pending further optimization.\n");
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
CL_CHECK(err, "clCreateContext");
queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
Expand Down Expand Up @@ -552,17 +546,18 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr

// compute
cl_event ev_sgemm;
clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
clblast::Transpose::kYes, clblast::Transpose::kNo,
ne01, ne11, ne10,
alpha,
d_X, 0, ne00,
d_Y, 0, ne10,
beta,
d_D, 0, ne01,
&queue, &ev_sgemm);
clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor,
(CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo,
ne01, ne11, ne10,
alpha,
d_X, 0, ne00,
d_Y, 0, ne10,
beta,
d_D, 0, ne01,
&queue, &ev_sgemm);

if (status != clblast::StatusCode::kSuccess) {
printf("\nF32 Matmul Failed (%d): You may be out of VRAM. Please check if you have enough.\n",status);
GGML_ASSERT(false);
}

Expand Down Expand Up @@ -650,18 +645,19 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr

// compute
cl_event ev_sgemm;
clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
clblast::Transpose::kYes, clblast::Transpose::kNo,
ne01, ne11, ne10,
alpha,
d_X, 0, ne00,
d_Y, 0, ne10,
beta,
d_D, 0, ne01,
&queue, &ev_sgemm);
clblast::StatusCode status = (clblast::StatusCode)CLBlastHgemm((CLBlastLayout)clblast::Layout::kColMajor,
(CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo,
ne01, ne11, ne10,
alpha,
d_X, 0, ne00,
d_Y, 0, ne10,
beta,
d_D, 0, ne01,
&queue, &ev_sgemm);

if (status != clblast::StatusCode::kSuccess) {
GGML_ASSERT(false);
printf("\nF16 Matmul Failed (%d): You may be out of VRAM. Please check if you have enough.\n",status);
GGML_ASSERT(false);
}

// copy dst to host, then convert to float
Expand Down Expand Up @@ -757,17 +753,18 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
CL_CHECK(clFinish(queue), "clFinish");

// compute
clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
clblast::Transpose::kYes, clblast::Transpose::kNo,
ne01, ne11, ne10,
alpha,
d_X, 0, ne00,
d_Y, 0, ne10,
beta,
d_D, 0, ne01,
&queue, &ev_sgemm);
clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor,
(CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo,
ne01, ne11, ne10,
alpha,
d_X, 0, ne00,
d_Y, 0, ne10,
beta,
d_D, 0, ne01,
&queue, &ev_sgemm);

if (status != clblast::StatusCode::kSuccess) {
printf("\nQF32 Matmul Failed (%d): You may be out of VRAM. Please check if you have enough.\n",status);
GGML_ASSERT(false);
}
}
Expand Down
22 changes: 22 additions & 0 deletions ggml-opencl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#pragma once

#include "ggml.h"

#ifdef __cplusplus
extern "C" {
#endif

void ggml_cl_init(void);

bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

void * ggml_cl_host_malloc(size_t size);
void ggml_cl_host_free(void * ptr);

void ggml_cl_transform_tensor(struct ggml_tensor * tensor);

#ifdef __cplusplus
}
#endif
Loading

0 comments on commit c048bcf

Please sign in to comment.