
Commit

Expose generation timings from server & update completions.js (ggerganov#2116)

* use JavaScript generators as a much cleaner API

Also add ways to access completion as promise and EventSource

* export llama_timings as struct and expose them in server

* update readme, update baked includes

* llama : uniform variable names + struct init

---------

Co-authored-by: Georgi Gerganov <[email protected]>
tobi and ggerganov committed Jul 5, 2023
1 parent 983b555 commit 31cfbb1
Showing 9 changed files with 1,926 additions and 1,368 deletions.
37 changes: 20 additions & 17 deletions examples/server/README.md
@@ -26,20 +26,17 @@ Command line options:

## Build

Build llama.cpp with server from repository root with either make or CMake.
The server is built alongside everything else from the root of the project.

- Using `make`:

```bash
LLAMA_BUILD_SERVER=1 make
make
```

- Using `CMake`:

```bash
mkdir build-server
cd build-server
cmake -DLLAMA_BUILD_SERVER=ON ..
cmake --build . --config Release
```

@@ -208,24 +205,30 @@ openai.api_base = "http://<Your api-server IP>:port"

Then you can use llama.cpp as an OpenAI-compatible **chat.completion** or **text_completion** API.
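
For example, a minimal browser-side sketch, assuming the OpenAI-style endpoint configured above follows the usual chat completion convention and is reachable at the `apiBase` placeholder below (host, port, and exact route depend on your setup):

```js
// Sketch only: the base URL, the /v1/chat/completions route and the response
// shape are assumptions based on the OpenAI API convention, not guarantees
// about this particular server setup.
const apiBase = "http://localhost:8081";

const response = await fetch(`${apiBase}/v1/chat/completions`, {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    messages: [{ role: "user", content: "Write a dad joke, one paragraph." }],
    temperature: 0.7
  })
});

const data = await response.json();
console.log(data.choices[0].message.content);
```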

### Extending the Web Front End
### Extending or building an alternative Web Front End

The default location for the static files is `examples/server/public`. You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method. A simple example is below:
The default location for the static files is `examples/server/public`. You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the completion API (the `llama()` generator and related helpers).

```
Read the documentation in `/completion.js` to see convenient ways to access llama.

A simple example is below:

```html
<html>
<body>
<pre>
<script type="module">
import { llamaComplete } from '/completion.js'
llamaComplete({
prompt: "### Instruction:\nWrite dad jokes, each one paragraph. You can use html formatting if needed.\n\n### Response:",
n_predict: 1024,
},
null,
(chunk) => document.write(chunk.data.content)
)
import { llama } from '/completion.js'
const prompt = `### Instruction:
Write dad jokes, each one paragraph.
You can use html formatting if needed.
### Response:`
for await (const chunk of llama(prompt)) {
document.write(chunk.data.content)
}
</script>
</pre>
</body>
548 changes: 365 additions & 183 deletions examples/server/completion.js.hpp

Large diffs are not rendered by default.

4 changes: 0 additions & 4 deletions examples/server/deps.sh
@@ -4,10 +4,6 @@
# get the directory of this script file
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
PUBLIC=$DIR/public
OUTPUT=$DIR/templats.hpp

echo "// Generated file, do not edit" > $OUTPUT
echo "" > $OUTPUT

echo "download js bundle files"
curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js
1,579 changes: 816 additions & 763 deletions examples/server/index.html.hpp

Large diffs are not rendered by default.

119 changes: 103 additions & 16 deletions examples/server/public/completion.js
@@ -5,20 +5,29 @@ const paramDefaults = {
stop: ["</s>"]
};

/**
* This function completes the input text using a llama dictionary.
* @param {object} params - The parameters for the completion request.
* @param {object} controller - an instance of AbortController if you need one, or null.
* @param {function} callback - The callback function to call when the completion is done.
* @returns {string} the completed text as a string. Ideally ignored, and you get at it via the callback.
*/
export const llamaComplete = async (params, controller, callback) => {
let generation_settings = null;


// Completes the prompt as a generator. Recommended for most use cases.
//
// Example:
//
// import { llama } from '/completion.js'
//
// const request = llama("Tell me a joke", {n_predict: 800})
// for await (const chunk of request) {
// document.write(chunk.data.content)
// }
//
export async function* llama(prompt, params = {}, config = {}) {
let controller = config.controller;

if (!controller) {
controller = new AbortController();
}
const completionParams = { ...paramDefaults, ...params };

// we use fetch directly here because the built-in fetchEventSource does not support POST
const completionParams = { ...paramDefaults, ...params, prompt };

const response = await fetch("/completion", {
method: 'POST',
body: JSON.stringify(completionParams),
@@ -36,7 +45,6 @@ export const llamaComplete = async (params, controller, callback) => {
let content = "";

try {

let cont = true;

while (cont) {
@@ -59,18 +67,21 @@ export const llamaComplete = async (params, controller, callback) => {
result.data = JSON.parse(result.data);
content += result.data.content;

// callback
if (callback) {
cont = callback(result) != false;
}
// yield
yield result;

// if we got a stop token from server, we will break here
if (result.data.stop) {
if (result.data.generation_settings) {
generation_settings = result.data.generation_settings;
}
break;
}
}
} catch (e) {
console.error("llama error: ", e);
if (e.name !== 'AbortError') {
console.error("llama error: ", e);
}
throw e;
}
finally {
@@ -79,3 +90,79 @@

return content;
}
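
// A rough usage sketch for cancelling a completion early. Assumption: the
// controller passed via config.controller has its signal wired to the
// underlying fetch, so calling abort() cancels the in-flight request.
//
//   import { llama } from '/completion.js'
//
//   const controller = new AbortController()
//   let received = 0
//   for await (const chunk of llama("Tell me a joke", {n_predict: 800}, { controller })) {
//     document.write(chunk.data.content)
//     received += chunk.data.content.length
//     if (received > 500) {
//       controller.abort() // stop streaming once we have enough text
//       break
//     }
//   }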

// Call llama, return an event target that you can subscribe to
//
// Example:
//
// import { llamaEventTarget } from '/completion.js'
//
// const conn = llamaEventTarget(prompt)
// conn.addEventListener("message", (chunk) => {
// document.write(chunk.detail.content)
// })
//
export const llamaEventTarget = (prompt, params = {}, config = {}) => {
const eventTarget = new EventTarget();
(async () => {
let content = "";
for await (const chunk of llama(prompt, params, config)) {
if (chunk.data) {
content += chunk.data.content;
eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
}
if (chunk.data.generation_settings) {
eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
}
if (chunk.data.timings) {
eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
}
}
eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
})();
return eventTarget;
}
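
// A rough usage sketch for the extra events dispatched above. The contents of
// e.detail for "generation_settings" and "timings" are whatever the server
// reports, so no particular field names are assumed here.
//
//   import { llamaEventTarget } from '/completion.js'
//
//   const conn = llamaEventTarget("Tell me a joke")
//   conn.addEventListener("timings", (e) => {
//     console.log("timings reported by the server:", e.detail)
//   })
//   conn.addEventListener("done", (e) => {
//     console.log("full completion:", e.detail.content)
//   })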

// Call llama, return a promise that resolves to the completed text. This does not support streaming
//
// Example:
//
// llamaPromise(prompt).then((content) => {
// document.write(content)
// })
//
// or
//
// const content = await llamaPromise(prompt)
// document.write(content)
//
export const llamaPromise = (prompt, params = {}, config = {}) => {
return new Promise(async (resolve, reject) => {
let content = "";
try {
for await (const chunk of llama(prompt, params, config)) {
content += chunk.data.content;
}
resolve(content);
} catch (error) {
reject(error);
}
});
};

/**
* (deprecated)
*/
export const llamaComplete = async (params, controller, callback) => {
for await (const chunk of llama(params.prompt, params, { controller })) {
callback(chunk);
}
}

// Get the model info from the server. This is useful for getting the context window and so on.
export const llamaModelInfo = async () => {
if (!generation_settings) {
generation_settings = await fetch("/model.json").then(r => r.json());
}
return generation_settings;
}
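
// A rough usage sketch. Assumption: /model.json returns the same
// generation-settings object cached above; its exact fields depend on the
// server build, so the whole object is logged rather than named fields.
//
//   import { llamaModelInfo } from '/completion.js'
//
//   const info = await llamaModelInfo()
//   console.log("model / generation settings:", info)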