[WIP, don't merge] unity.cpp -> ggml master #719

Open · wants to merge 3 commits into master

Changes from 1 commit
fix initialization before padding
cndn committed Jan 27, 2024
commit e394ec93c741b1a19c610c7b7710d19f4ae154f0
12 changes: 10 additions & 2 deletions examples/unity/fairseq2.cpp
@@ -810,14 +810,22 @@ extern "C" ggml_tensor* StandardConformerEncoderAdaptorLayer_forward(
     ggml_tensor* residual = seqs;
     residual = LayerNorm_forward(model, prefix + ".residual_layer_norm", residual);
     residual = ggml_dup(ctx, ggml_permute(ctx, residual, 1, 0, 2, 3));
-    residual = ggml_conv_1d(ctx, model.tensors[prefix + ".residual_conv.weight"], residual, 8, 4, 1);
+    ggml_tensor* residual_conv_weight = model.tensors[prefix + ".residual_conv.weight"];
+    // ggml_tensor* from = model.tensors[prefix + ".residual_conv.weight"];
+    // FORCE_ALLOC(residual_conv_weight, ctx, ggml_new_tensor_3d(ctx, GGML_TYPE_F16, from->ne[0], from->ne[1], from->ne[2]));
+    // ggml_fp32_to_fp16_row((float*)model.tensors[prefix + ".residual_conv.weight"]->data, (ggml_fp16_t*)residual_conv_weight->data, from->ne[0] * from->ne[1] * from->ne[2]);
+    residual = ggml_conv_1d(ctx, residual_conv_weight, residual, 8, 4, 1);
     residual = ggml_dup(ctx, ggml_permute(ctx, residual, 1, 0, 2, 3));
     residual = ggml_add_inplace(ctx, ggml_repeat(ctx, model.tensors[prefix + ".residual_conv.bias"], residual), residual);
     residual = ggml_glu(ctx, residual);

     seqs = LayerNorm_forward(model, prefix + ".self_attn_layer_norm", seqs);
     seqs = ggml_dup(ctx, ggml_permute(ctx, seqs, 1, 0, 2, 3));
-    seqs = ggml_conv_1d(ctx, model.tensors[prefix + ".self_attn_conv.weight"], seqs, 8, 4, 1);
+    ggml_tensor* self_attn_conv_weight = model.tensors[prefix + ".self_attn_conv.weight"];
+    // from = model.tensors[prefix + ".self_attn_conv.weight"];
+    // FORCE_ALLOC(self_attn_conv_weight, ctx, ggml_new_tensor_3d(ctx, GGML_TYPE_F16, from->ne[0], from->ne[1], from->ne[2]));
+    // ggml_fp32_to_fp16_row((float*)model.tensors[prefix + ".self_attn_conv.weight"]->data, (ggml_fp16_t*)residual_conv_weight->data, from->ne[0] * from->ne[1] * from->ne[2]);
+    seqs = ggml_conv_1d(ctx, self_attn_conv_weight, seqs, 8, 4, 1);
     seqs = ggml_dup(ctx, ggml_permute(ctx, seqs, 1, 0, 2, 3));
     seqs = ggml_add_inplace(ctx, seqs, ggml_repeat(ctx, model.tensors[prefix + ".self_attn_conv.bias"], seqs));
     seqs = ggml_glu(ctx, seqs);
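Note on this hunk: the live change only hoists the weight lookup into a named variable; the commented-out FORCE_ALLOC / ggml_fp32_to_fp16_row lines sketch an abandoned attempt to convert the F32 checkpoint weight to F16 before the convolution (ggml's conv paths generally expect F16 kernels). The second commented-out conversion also writes into residual_conv_weight->data where self_attn_conv_weight->data was presumably intended. A minimal sketch of that conversion, assuming contiguous F32 weights (the helper is ours, not part of the PR):

// Hypothetical helper: make an F16 copy of a contiguous F32 weight tensor so
// it can be fed to ggml_conv_1d.
static ggml_tensor* weight_to_fp16(ggml_context* ctx, const ggml_tensor* src) {
    ggml_tensor* dst = ggml_new_tensor_3d(ctx, GGML_TYPE_F16,
                                          src->ne[0], src->ne[1], src->ne[2]);
    // convert all ne0*ne1*ne2 values in one pass, as the commented code does
    ggml_fp32_to_fp16_row((const float*) src->data,
                          (ggml_fp16_t*) dst->data,
                          ggml_nelements(src));
    return dst;
}

The call site would then read ggml_conv_1d(ctx, weight_to_fp16(ctx, residual_conv_weight), residual, 8, 4, 1), with 8, 4, 1 being ggml_conv_1d's stride, padding, and dilation parameters.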
7 changes: 6 additions & 1 deletion examples/unity/lib/unity_lib.cpp
@@ -68,7 +68,7 @@ extern "C" Result unity_eval_speech(fairseq2_model& model, std::vector<float>& d
     Result result;
     // The ctx_size_mb mostly depends of input length and model dim.
     int ctx_size_mb = opts.mem_mb;
-    auto encoder_buf = std::vector<uint8_t>(8 * 1024 * 1024); // this is only for tensor metadata, it can be small
+    auto encoder_buf = std::vector<uint8_t>(80 * 1024 * 1024); // this is only for tensor metadata, it can be small
     auto encoder_fwd_buf = std::vector<uint8_t>(ctx_size_mb * 1024 * 1024);
     ggml_allocr* fwd_alloc = ggml_allocr_new(encoder_fwd_buf.data(), encoder_fwd_buf.capacity(), 8);
     int tgt_lang_idx;
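Review note: the metadata context grows tenfold here, from 8 MB to 80 MB, while the trailing comment still claims "it can be small"; the comment is now stale, since the tensor structs for this graph evidently outgrew 8 MB. For reference, a hedged sketch of the two-buffer pattern in play (includes, field comments, and the no_alloc assumption are ours; sizes illustrative):

#include <cstdint>
#include <vector>
#include "ggml.h"
#include "ggml-alloc.h"

const int ctx_size_mb = 512;  // e.g. opts.mem_mb
// A no_alloc context holds only ggml_tensor structs; actual tensor data is
// handed out by a graph allocator from a separate flat buffer.
std::vector<uint8_t> meta_buf(80 * 1024 * 1024);
ggml_init_params iparams = {
    /*.mem_size   =*/ meta_buf.size(),
    /*.mem_buffer =*/ meta_buf.data(),
    /*.no_alloc   =*/ true,   // assumption: data pointers come from the allocator
};
ggml_context* ctx = ggml_init(iparams);

std::vector<uint8_t> fwd_buf((size_t) ctx_size_mb * 1024 * 1024);
ggml_allocr* fwd_alloc = ggml_allocr_new(fwd_buf.data(), fwd_buf.size(), /*alignment=*/8);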
@@ -97,6 +97,11 @@ extern "C" Result unity_eval_speech(fairseq2_model& model, std::vector<float>& d
     ggml_graph_compute_with_ctx(model.ctx, gf, n_threads);
     // encoder_output is valid until we call `ggml_allocr_reset(fwd_alloc)`
     ggml_tensor* encoder_output = gf->nodes[gf->n_nodes - 1];
+    // for(int i = 0; i < 100; i++) {
+    //     float* ptr = static_cast<float*>(encoder_output->data);
+    //     printf("%4f ", ptr[i]);
+    // }
+    // exit(0);

     // Beam search decoding
     const Hypothesis* hypo = unity_decode(model, opts, tgt_lang_idx, encoder_output, n_threads);
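The commented-out block above is a debug probe that dumps the first 100 encoder outputs and exits. A reusable version of the same probe, assuming a contiguous F32 tensor on the CPU (the helper name is ours; the original's "%4f" is almost certainly meant to be "%.4f"):

// Hypothetical debug helper: print the first n values of a contiguous F32
// tensor. Only valid while the data is alive, i.e. before ggml_allocr_reset.
static void print_tensor_head(const ggml_tensor* t, int64_t n) {
    const float* p = (const float*) t->data;
    const int64_t count = ggml_nelements(t);
    for (int64_t i = 0; i < n && i < count; ++i) {
        printf("%.4f ", p[i]);
    }
    printf("\n");
}

Called as print_tensor_head(encoder_output, 100), it reproduces the commented loop without the hard exit(0).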
16 changes: 0 additions & 16 deletions examples/unity/model_loader.cpp
@@ -137,25 +137,18 @@ void model_loader::load_vocab(llama_vocab& vocab, std::ifstream &fin)
     if (vocab_size == 0) {
         return;
     }
-    printf("load vocab 0\n");

     vocab.token_to_id.reserve(vocab_size);
     vocab.id_to_token.reserve(vocab_size);

     std::string packed_vocab = get_name(fin);
     std::int64_t ctx_size = vocab_size * sizeof(float) + vocab_size + 2 * ggml_tensor_overhead();
     ctx_size *= 2;
-    printf("load vocab 1\n");
     ggml_context* ctx = ggml_init(ggml_init_params{static_cast<size_t>(ctx_size), nullptr, false});
-    printf("load vocab 1.1\n");
     ggml_tensor* lengths_tensor = load_tensor_value(fin, ctx, true);
-    printf("load vocab 1.2\n");
     std::int8_t* lengths = (std::int8_t*)lengths_tensor->data;
-    printf("load vocab 1.3\n");
     ggml_tensor* scores_tensor = load_tensor_value(fin, ctx, true);
-    printf("load vocab 1.4\n");
     float* scores = ggml_get_data_f32(scores_tensor);
-    printf("load vocab 2\n");

     int64_t offset = 0;
     for (int i = 0; i < vocab_size; ++i) {
@@ -165,7 +158,6 @@ void model_loader::load_vocab(llama_vocab& vocab, std::ifstream &fin)
         vocab.id_to_token.push_back({word, scores[i], LLAMA_TOKEN_TYPE_NORMAL});
         offset += lengths[i] + 1;
     }
-    printf("load vocab 3\n");
     // Since we copied lengths and scores, we don't need the context anymore.
     ggml_free(ctx);

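For orientation, the surrounding code implies a simple packed layout for the vocabulary section: one int8 length per token (lengths_tensor), one float score per token (scores_tensor), and all token strings concatenated into packed_vocab with a one-byte separator after each word, which is why the loop advances by lengths[i] + 1. The ctx_size estimate matches this reading: vocab_size floats for scores, vocab_size bytes for lengths, overhead for the two tensors, doubled for slack. A standalone sketch of the unpacking under those assumptions (types and names ours):

#include <cstdint>
#include <string>
#include <vector>

struct VocabEntry { std::string word; float score; };

// Sketch: rebuild (word, score) pairs from the packed vocab blob.
static std::vector<VocabEntry> unpack_vocab(const std::string& packed,
                                            const std::int8_t* lengths,
                                            const float* scores,
                                            int vocab_size) {
    std::vector<VocabEntry> out;
    out.reserve(vocab_size);
    std::int64_t offset = 0;
    for (int i = 0; i < vocab_size; ++i) {
        out.push_back({packed.substr(offset, lengths[i]), scores[i]});
        offset += lengths[i] + 1;  // +1 skips the separator byte after each word
    }
    return out;
}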
@@ -224,22 +216,14 @@ model_loader::get_name(std::ifstream& fin)

extern "C" int load_fairseq2_ggml_file(fairseq2_model& model, const char* fname) {
model_loader loader;
printf("here 1\n");
assert_endianness();
printf("here 2\n");
auto fin = open_ggml_file(fname);
printf("here 3\n");
loader.load_hparams(model.hparams, fin);
printf("here 4\n");
loader.load_hparams(model.layer_config, fin);
printf("here 4.2\n");
loader.load_vocab(model.vocab, fin);
printf("here 4.7\n");
loader.load_model_weights(model, fin);
printf("here 5\n");

// load optional target vocabulary in cases of bilingual models
loader.load_vocab(model.tgt_vocab, fin);
printf("here 6\n");
return 0;
}
100 changes: 44 additions & 56 deletions examples/unity/scripts/ggml.py
@@ -55,14 +55,14 @@
 import ctypes
 import pathlib
 import importlib.resources
-from typing import Callable, List, Optional, Sequence, Union
+from pathlib import Path
+from typing import List, Optional, Sequence, Union, Callable
 from typing_extensions import TypeAlias


 # Load the library
-def load_shared_library(module_name: str, lib_base_name: str):
+def load_shared_library(base_path: Path, lib_base_name: str):
     # Construct the paths to the possible shared library names
-    base_path = pathlib.Path(__file__).parent.resolve()
     # Searching for the library in the current directory under the name "libggml" (default name
     # for ggml) and "ggml" (default name for this repo)
     lib_names: List[str] = [
@@ -71,41 +71,30 @@ def load_shared_library(module_name: str, lib_base_name: str):
f"{lib_base_name}.dll",
]

path: Optional[pathlib.Path] = None

for lib_name in lib_names:
try:
print(module_name)
print(lib_name)
with importlib.resources.path(module_name, lib_name) as p:
if os.path.exists(p):
path = p
break
except FileNotFoundError:
pass

if path is None:
raise FileNotFoundError(
f"Shared library with base name '{lib_base_name}' not found"
)

cdll_args = dict() # type: ignore
# Add the library directory to the DLL search path on Windows (if needed)
if sys.platform == "win32" and sys.version_info >= (3, 8):
os.add_dll_directory(str(base_path))
cdll_args["winmode"] = 0

# Try to load the shared library, handling potential errors
try:
return ctypes.CDLL(str(path), **cdll_args)
except Exception as e:
raise RuntimeError(f"Failed to load shared library '{path}': {e}")
for lib_name in lib_names:
# Try to load the shared library, handling potential errors
path = base_path / lib_name
if not path.exists():
continue
try:
return ctypes.CDLL(str(path), **cdll_args)
except Exception as e:
raise RuntimeError(f"Failed to load shared library '{path}': {e}")

raise FileNotFoundError(
f"Shared library with base name '{lib_base_name}' not found in {base_path}"
)

module_name = "ggml"
lib_base_name = "ggml"
lib = load_shared_library(module_name, lib_base_name)

base_path = pathlib.Path(__file__).parent.resolve() / "../../../build/examples/unity"
lib_base_name = "fairseq2_cpp"
lib = load_shared_library(base_path, lib_base_name)

#####################################################
# GGML Utility Types
@@ -231,7 +220,6 @@ def ggml_fp32_to_fp16_row(
 ]
 lib.ggml_fp32_to_fp16_row.restype = None

-
 # struct ggml_context;
 ggml_context_p = ctypes.c_void_p
 """Opaque pointer to a ggml_context.
@@ -7330,29 +7318,29 @@ def ggml_quantize_chunk(

 # // These are needed for IQ2_XS and IQ2_XXS quantizations
 # GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
-def ggml_init_iq2_quantization(
-    type: Union[ctypes.c_int, int],
-):
-    return lib.ggml_init_iq2_quantization(type)
+# def ggml_init_iq2_quantization(
+#     type: Union[ctypes.c_int, int],
+# ):
+#     return lib.ggml_init_iq2_quantization(type)


-lib.ggml_init_iq2_quantization.argtypes = [
-    ctypes.c_int,
-]
-lib.ggml_init_iq2_quantization.restype = None
+# lib.ggml_init_iq2_quantization.argtypes = [
+#     ctypes.c_int,
+# ]
+# lib.ggml_init_iq2_quantization.restype = None


 # GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
-def ggml_deinit_iq2_quantization(
-    type: Union[ctypes.c_int, int],
-):
-    return lib.ggml_deinit_iq2_quantization(type)
+# def ggml_deinit_iq2_quantization(
+#     type: Union[ctypes.c_int, int],
+# ):
+#     return lib.ggml_deinit_iq2_quantization(type)


-lib.ggml_deinit_iq2_quantization.argtypes = [
-    ctypes.c_int,
-]
-lib.ggml_deinit_iq2_quantization.restype = None
+# lib.ggml_deinit_iq2_quantization.argtypes = [
+#     ctypes.c_int,
+# ]
+# lib.ggml_deinit_iq2_quantization.restype = None

 # //
 # // Importance matrix
@@ -7364,18 +7352,18 @@


 # GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect);
-def ggml_set_imatrix_collection(
-    imatrix_collect: Callable[
-        [ggml_tensor_p, ggml_tensor_p], None
-    ]  # TODO: Fix type signature here
-):
-    return lib.ggml_set_imatrix_collection(imatrix_collect)
+# def ggml_set_imatrix_collection(
+#     imatrix_collect: Callable[
+#         [ggml_tensor_p, ggml_tensor_p], None
+#     ]  # TODO: Fix type signature here
+# ):
+#     return lib.ggml_set_imatrix_collection(imatrix_collect)


-lib.ggml_set_imatrix_collection.argtypes = [
-    ggml_collect_imatrix_t,
-]
-lib.ggml_set_imatrix_collection.restype = None
+# lib.ggml_set_imatrix_collection.argtypes = [
+#     ggml_collect_imatrix_t,
+# ]
+# lib.ggml_set_imatrix_collection.restype = None

 # //
 # // gguf
21 changes: 10 additions & 11 deletions examples/unity/unity.cpp
@@ -99,22 +99,21 @@ int main(int argc, char ** argv) {

     // The ctx_size_mb mostly depends of input length and model dim.
     int ctx_size_mb = params.opts.mem_mb;
     auto encoder_buf = std::vector<uint8_t>(8 * 1024 * 1024); // Only tensor metadata goes in there
     auto encoder_fwd_buf = std::vector<uint8_t>(ctx_size_mb * 1024 * 1024 / 2);

     while (true) {
         // S2ST
         if (!params.text) {
-            std::string input;
-            std::cout << "\nEnter audio_path and tgt_lang, separated by space (or 'exit' to quit):\n";
-            std::getline(std::cin, input);
-            if (input == "exit") {
-                break;
-            }
-            std::istringstream iss(input);
-            std::string audio_path;
-            std::string tgt_lang;
-            iss >> audio_path >> tgt_lang;
+            // std::string input;
+            // std::cout << "\nEnter audio_path and tgt_lang, separated by space (or 'exit' to quit):\n";
+            // std::getline(std::cin, input);
+            // if (input == "exit") {
+            //     break;
+            // }
+            // std::istringstream iss(input);
+            std::string audio_path = "/private/home/yilinyang/tmp/LJ037-0171_sr16k.wav";
+            std::string tgt_lang = "eng";
+            // iss >> audio_path >> tgt_lang;
             if (audio_path == "-") {
                 audio_path = "/proc/self/fd/0";
             }
4 changes: 4 additions & 0 deletions include/ggml/ggml.h
@@ -450,6 +450,10 @@ extern "C" {
     GGML_OP_DEPTHWISE_CONV_STAGE_0, // internal
     GGML_OP_DEPTHWISE_CONV_STAGE_1, // internal
     GGML_OP_DEPTHWISE_CONV_STAGE_2, // internal
+
+    GGML_OP_CONV_1D_GENERIC_STAGE_0,
+    GGML_OP_CONV_1D_GENERIC_STAGE_1,
+
     GGML_OP_UPSCALE, // nearest interpolate
     GGML_OP_PAD,
     GGML_OP_ARGSORT,
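The two new op codes follow the STAGE_0/STAGE_1 naming of the depthwise ops above them, which suggests the generic 1-D convolution is likewise split into an unfold stage and a reduction stage (the kernels themselves are not part of this hunk, so this is our reading). A self-contained sketch of that decomposition for a single channel, with hypothetical helper names:

#include <vector>

// Stage 0 (im2col-style): for each output position, gather the k input taps
// that feed it, honoring stride and zero padding.
static std::vector<float> conv1d_stage0(const std::vector<float>& x,
                                        int k, int stride, int pad) {
    const int n = (int) x.size();
    const int n_out = (n + 2 * pad - k) / stride + 1;
    std::vector<float> cols((size_t) n_out * k, 0.0f);
    for (int o = 0; o < n_out; ++o) {
        for (int t = 0; t < k; ++t) {
            const int src = o * stride + t - pad;
            if (src >= 0 && src < n) cols[(size_t) o * k + t] = x[src];
        }
    }
    return cols;
}

// Stage 1: reduce each gathered column against the kernel (this becomes a
// matrix multiply in the multi-channel case).
static std::vector<float> conv1d_stage1(const std::vector<float>& cols,
                                        const std::vector<float>& w) {
    const int k = (int) w.size();
    const int n_out = (int) (cols.size() / k);
    std::vector<float> y(n_out, 0.0f);
    for (int o = 0; o < n_out; ++o)
        for (int t = 0; t < k; ++t)
            y[o] += cols[(size_t) o * k + t] * w[t];
    return y;
}

With stride 8 and padding 4, as in the ggml_conv_1d calls in fairseq2.cpp, conv1d_stage1(conv1d_stage0(x, (int) w.size(), 8, 4), w) computes the same downsampling convolution per channel.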