
Commit

Expose generation timings from server & update completions.js (ggerganov#2116)

* use JavaScript generators as a much cleaner API

Also add ways to access completion as promise and EventSource

* export llama_timings as struct and expose them in server

* update readme, update baked includes

* llama : uniform variable names + struct init

---------

Co-authored-by: Georgi Gerganov <[email protected]>
tobi and ggerganov committed Jul 5, 2023
1 parent 983b555 commit 31cfbb1
Showing 9 changed files with 1,926 additions and 1,368 deletions.
37 changes: 20 additions & 17 deletions examples/server/README.md
@@ -26,20 +26,17 @@ Command line options:

## Build

Build llama.cpp with server from repository root with either make or CMake.
The server is built alongside everything else from the root of the project.

- Using `make`:

```bash
LLAMA_BUILD_SERVER=1 make
make
```

- Using `CMake`:

```bash
mkdir build-server
cd build-server
cmake -DLLAMA_BUILD_SERVER=ON ..
cmake --build . --config Release
```

@@ -208,24 +205,30 @@ openai.api_base = "http://<Your api-server IP>:port"

Then you can use llama.cpp as an OpenAI-compatible **chat.completion** or **text_completion** API.
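
For example, a minimal browser-side sketch, assuming the OpenAI-style endpoint configured above follows the usual chat completion convention and is reachable at the `apiBase` placeholder below (host, port, and exact route depend on your setup):

```js
// Sketch only: the base URL, the /v1/chat/completions route and the response
// shape are assumptions based on the OpenAI API convention, not guarantees
// about this particular server setup.
const apiBase = "http://localhost:8081";

const response = await fetch(`${apiBase}/v1/chat/completions`, {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    messages: [{ role: "user", content: "Write a dad joke, one paragraph." }],
    temperature: 0.7
  })
});

const data = await response.json();
console.log(data.choices[0].message.content);
```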

### Extending the Web Front End
### Extending or building an alternative Web Front End

The default location for the static files is `examples/server/public`. You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method. A simple example is below:
The default location for the static files is `examples/server/public`. You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the completion API (the `llama()` generator and related helpers).

```
Read the documentation in `/completion.js` to see convenient ways to access llama.

A simple example is below:

```html
<html>
<body>
<pre>
<script type="module">
import { llamaComplete } from '/completion.js'
llamaComplete({
prompt: "### Instruction:\nWrite dad jokes, each one paragraph. You can use html formatting if needed.\n\n### Response:",
n_predict: 1024,
},
null,
(chunk) => document.write(chunk.data.content)
)
import { llama } from '/completion.js'
const prompt = `### Instruction:
Write dad jokes, each one paragraph.
You can use html formatting if needed.
### Response:`
for await (const chunk of llama(prompt)) {
document.write(chunk.data.content)
}
</script>
</pre>
</body>
548 changes: 365 additions & 183 deletions examples/server/completion.js.hpp

Large diffs are not rendered by default.

4 changes: 0 additions & 4 deletions examples/server/deps.sh
@@ -4,10 +4,6 @@
# get the directory of this script file
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
PUBLIC=$DIR/public
OUTPUT=$DIR/templats.hpp

echo "// Generated file, do not edit" > $OUTPUT
echo "" > $OUTPUT

echo "download js bundle files"
curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js
1,579 changes: 816 additions & 763 deletions examples/server/index.html.hpp

Large diffs are not rendered by default.

119 changes: 103 additions & 16 deletions examples/server/public/completion.js
@@ -5,20 +5,29 @@ const paramDefaults = {
stop: ["</s>"]
};

/**
* This function completes the input text using a llama dictionary.
* @param {object} params - The parameters for the completion request.
* @param {object} controller - an instance of AbortController if you need one, or null.
* @param {function} callback - The callback function to call when the completion is done.
* @returns {string} the completed text as a string. Ideally ignored, and you get at it via the callback.
*/
export const llamaComplete = async (params, controller, callback) => {
let generation_settings = null;


// Completes the prompt as a generator. Recommended for most use cases.
//
// Example:
//
// import { llama } from '/completion.js'
//
// const request = llama("Tell me a joke", {n_predict: 800})
// for await (const chunk of request) {
// document.write(chunk.data.content)
// }
//
export async function* llama(prompt, params = {}, config = {}) {
let controller = config.controller;

if (!controller) {
controller = new AbortController();
}
const completionParams = { ...paramDefaults, ...params };

// we use fetch directly here because the built-in fetchEventSource does not support POST
const completionParams = { ...paramDefaults, ...params, prompt };

const response = await fetch("/completion", {
method: 'POST',
body: JSON.stringify(completionParams),
@@ -36,7 +45,6 @@ export const llamaComplete = async (params, controller, callback) => {
let content = "";

try {

let cont = true;

while (cont) {
@@ -59,18 +67,21 @@ export const llamaComplete = async (params, controller, callback) => {
result.data = JSON.parse(result.data);
content += result.data.content;

// callback
if (callback) {
cont = callback(result) != false;
}
// yield
yield result;

// if we got a stop token from server, we will break here
if (result.data.stop) {
if (result.data.generation_settings) {
generation_settings = result.data.generation_settings;
}
break;
}
}
} catch (e) {
console.error("llama error: ", e);
if (e.name !== 'AbortError') {
console.error("llama error: ", e);
}
throw e;
}
finally {
@@ -79,3 +90,79 @@

return content;
}
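
// A rough usage sketch for cancelling a completion early. Assumption: the
// controller passed via config.controller has its signal wired to the
// underlying fetch, so calling abort() cancels the in-flight request.
//
//   import { llama } from '/completion.js'
//
//   const controller = new AbortController()
//   let received = 0
//   for await (const chunk of llama("Tell me a joke", {n_predict: 800}, { controller })) {
//     document.write(chunk.data.content)
//     received += chunk.data.content.length
//     if (received > 500) {
//       controller.abort() // stop streaming once we have enough text
//       break
//     }
//   }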

// Call llama, return an event target that you can subscribe to
//
// Example:
//
// import { llamaEventTarget } from '/completion.js'
//
// const conn = llamaEventTarget(prompt)
// conn.addEventListener("message", (chunk) => {
// document.write(chunk.detail.content)
// })
//
export const llamaEventTarget = (prompt, params = {}, config = {}) => {
const eventTarget = new EventTarget();
(async () => {
let content = "";
for await (const chunk of llama(prompt, params, config)) {
if (chunk.data) {
content += chunk.data.content;
eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
}
if (chunk.data.generation_settings) {
eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
}
if (chunk.data.timings) {
eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
}
}
eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
})();
return eventTarget;
}
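
// A rough usage sketch for the extra events dispatched above. The contents of
// e.detail for "generation_settings" and "timings" are whatever the server
// reports, so no particular field names are assumed here.
//
//   import { llamaEventTarget } from '/completion.js'
//
//   const conn = llamaEventTarget("Tell me a joke")
//   conn.addEventListener("timings", (e) => {
//     console.log("timings reported by the server:", e.detail)
//   })
//   conn.addEventListener("done", (e) => {
//     console.log("full completion:", e.detail.content)
//   })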

// Call llama, return a promise that resolves to the completed text. This does not support streaming
//
// Example:
//
// llamaPromise(prompt).then((content) => {
// document.write(content)
// })
//
// or
//
// const content = await llamaPromise(prompt)
// document.write(content)
//
export const llamaPromise = (prompt, params = {}, config = {}) => {
return new Promise(async (resolve, reject) => {
let content = "";
try {
for await (const chunk of llama(prompt, params, config)) {
content += chunk.data.content;
}
resolve(content);
} catch (error) {
reject(error);
}
});
};

/**
* (deprecated)
*/
export const llamaComplete = async (params, controller, callback) => {
for await (const chunk of llama(params.prompt, params, { controller })) {
callback(chunk);
}
}

// Get the model info from the server. This is useful for getting the context window and so on.
export const llamaModelInfo = async () => {
if (!generation_settings) {
generation_settings = await fetch("/model.json").then(r => r.json());
}
return generation_settings;
}
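
// A rough usage sketch. Assumption: /model.json returns the same
// generation-settings object cached above; its exact fields depend on the
// server build, so the whole object is logged rather than named fields.
//
//   import { llamaModelInfo } from '/completion.js'
//
//   const info = await llamaModelInfo()
//   console.log("model / generation settings:", info)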