simonw · simonw · Sep 12, 2023 · Sep 9, 2023 · Sep 10, 2023 · Sep 10, 2023
diff --git a/docs/embeddings/python-api.md b/docs/embeddings/python-api.md
@@ -10,8 +10,13 @@ embedding_model = llm.get_embedding_model("ada-002")
 To embed a string, returning a Python list of floating point numbers, use the `.embed()` method:
 ```python
 vector = embedding_model.embed("my happy hound")
+```
+If the embedding model can handle binary input, you can call `.embed()` with a byte string instead. You can check the `supports_binary` property to see if this is supported:
+```python
+if embedding_model.supports_binary:
+ vector = embedding_model.embed(open("my-image.jpg", "rb").read())
 ```
-Many embeddings models are more efficient when you embed multiple strings at once. To embed multiple strings at once, use the `.embed_multi()` method:
+Many embedding models are more efficient when you embed multiple strings or binary strings at once. To embed multiple items at once, use the `.embed_multi()` method:
 ```python
 vectors = list(embedding_model.embed_multi(["my happy hound", "my dissatisfied cat"]))
 ```
@@ -63,7 +68,7 @@ This additional metadata will be stored as JSON in the `metadata` column of the
 (embeddings-python-bulk)=
 ### Storing embeddings in bulk
 
-The `collection.embed_multi()` method can be used to store embeddings for multiple strings at once. This can be more efficient for some embedding models.
+The `collection.embed_multi()` method can be used to store embeddings for multiple items at once. This can be more efficient for some embedding models.
 
 ```python
 collection.embed_multi(
@@ -177,6 +182,7 @@ CREATE TABLE "embeddings" (
  [id] TEXT,
  [embedding] BLOB,
  [content] TEXT,
+ [content_blob] BLOB,
  [content_hash] BLOB,
  [metadata] TEXT,
  [updated] INTEGER,

diff --git a/docs/help.md b/docs/help.md
@@ -476,11 +476,12 @@ Usage: llm embed [OPTIONS] [COLLECTION] [ID]
  Embed text and store or return the result
 
 Options:
- -i, --input FILENAME File to embed
+ -i, --input PATH  File to embed
  -m, --model TEXT Embedding model to use
  --store Store the text itself in the database
  -d, --database FILE
  -c, --content TEXT Content to embed
+ --binary Treat input as binary data
  --metadata TEXT JSON object metadata to store
  -f, --format [json|blob|base64|hex]
  Output format
@@ -511,6 +512,7 @@ Options:
  --files <DIRECTORY TEXT>... Embed files in this directory - specify directory
  and glob pattern
  --encoding TEXT Encoding to use when reading --files
+ --binary Treat --files as binary data
  --sql TEXT Read input using this SQL query
  --attach <TEXT FILE>... Additional databases to attach - specify alias
  and file path

diff --git a/llm/cli.py b/llm/cli.py
@@ -34,7 +34,7 @@
 from sqlite_utils.utils import rows_from_file, Format
 import sys
 import textwrap
-from typing import cast, Optional
+from typing import cast, Optional, Iterable, Union, Tuple
 import warnings
 import yaml
 
@@ -1025,7 +1025,7 @@ def uninstall(packages, yes):
 @click.option(
  "-i",
  "--input",
- type=click.File("r"),
+ type=click.Path(exists=True, readable=True, allow_dash=True),
  help="File to embed",
 )
 @click.option("-m", "--model", help="Embedding model to use")
@@ -1041,6 +1041,7 @@ def uninstall(packages, yes):
  "--content",
  help="Content to embed",
 )
+@click.option("--binary", is_flag=True, help="Treat input as binary data")
 @click.option(
  "--metadata",
  help="JSON object metadata to store",
@@ -1053,7 +1054,9 @@ def uninstall(packages, yes):
  type=click.Choice(["json", "blob", "base64", "hex"]),
  help="Output format",
 )
-def embed(collection, id, input, model, store, database, content, metadata, format_):
+def embed(
+ collection, id, input, model, store, database, content, binary, metadata, format_
+):
  """Embed text and store or return the result"""
  if collection and not id:
  raise click.ClickException("Must provide both collection and id")
@@ -1101,10 +1104,15 @@ def get_db():
 
  # Resolve input text
  if not content:
- if not input:
+ if not input or input == "-":
  # Read from stdin
- input = sys.stdin
- content = input.read()
+ input_source = sys.stdin.buffer if binary else sys.stdin
+ content = input_source.read()
+ else:
+ mode = "rb" if binary else "r"
+ with open(input, mode) as f:
+ content = f.read()
+
  if not content:
  raise click.ClickException("No content provided")
 
@@ -1148,6 +1156,7 @@ def get_db():
  help="Encoding to use when reading --files",
  multiple=True,
 )
+@click.option("--binary", is_flag=True, help="Treat --files as binary data")
 @click.option("--sql", help="Read input using this SQL query")
 @click.option(
  "--attach",
@@ -1170,6 +1179,7 @@ def embed_multi(
  format,
  files,
  encodings,
+ binary,
  sql,
  attach,
  prefix,
@@ -1193,6 +1203,10 @@ def embed_multi(
  2. A SQL query against a SQLite database
  3. A directory of files
  """
+ if binary and not files:
+ raise click.UsageError("--binary must be used with --files")
+ if binary and encodings:
+ raise click.UsageError("--binary cannot be used with --encoding")
  if not input_path and not sql and not files:
  raise click.UsageError("Either --sql or input path or --files is required")
 
@@ -1235,11 +1249,14 @@ def iterate_files():
  for path in pathlib.Path(directory).glob(pattern):
  relative = path.relative_to(directory)
  content = None
- for encoding in encodings:
- try:
- content = path.read_text(encoding=encoding)
- except UnicodeDecodeError:
- continue
+ if binary:
+ content = path.read_bytes()
+ else:
+ for encoding in encodings:
+ try:
+ content = path.read_text(encoding=encoding)
+ except UnicodeDecodeError:
+ continue
  if content is None:
  # Log to stderr
  click.echo(
@@ -1280,12 +1297,14 @@ def load_rows(fp):
  rows, label="Embedding", show_percent=True, length=expected_length
  ) as rows:
 
- def tuples():
+ def tuples() -> Iterable[Tuple[str, Union[bytes, str]]]:
  for row in rows:
  values = list(row.values())
  id = prefix + str(values[0])
- text = " ".join(v or "" for v in values[1:])
- yield id, text
+ if binary:
+ yield id, cast(bytes, values[1])
+ else:
+ yield id, " ".join(v or "" for v in values[1:])
 
  # collection_obj.max_batch_size = 1
  collection_obj.embed_multi(tuples(), store=store)

diff --git a/llm/default_plugins/openai_models.py b/llm/default_plugins/openai_models.py
@@ -67,9 +67,9 @@ class Ada002(EmbeddingModel):
  key_env_var = "OPENAI_API_KEY"
  batch_size = 100 # Maybe this should be 2048
 
- def embed_batch(self, texts: Iterable[str]) -> Iterator[List[float]]:
+ def embed_batch(self, items: Iterable[Union[str, bytes]]) -> Iterator[List[float]]:
  results = openai.Embedding.create(
- input=texts, model="text-embedding-ada-002", api_key=self.get_key()
+ input=items, model="text-embedding-ada-002", api_key=self.get_key()
  )["data"]
  return ([float(r) for r in result["embedding"]] for result in results)
 

diff --git a/llm/embeddings.py b/llm/embeddings.py
@@ -7,7 +7,7 @@
 from sqlite_utils import Database
 from sqlite_utils.db import Table
 import time
-from typing import cast, Any, Dict, Iterable, List, Optional, Tuple
+from typing import cast, Any, Dict, Iterable, List, Optional, Tuple, Union
 
 
 @dataclass
@@ -117,33 +117,34 @@ def count(self) -> int:
  def embed(
  self,
  id: str,
- text: str,
+ value: Union[str, bytes],
  metadata: Optional[Dict[str, Any]] = None,
  store: bool = False,
  ) -> None:
  """
- Embed text and store it in the collection with a given ID.
+ Embed value and store it in the collection with a given ID.
 
  Args:
- id (str): ID for the text
- text (str): Text to be embedded
+ id (str): ID for the value
+ value (str or bytes): value to be embedded
  metadata (dict, optional): Metadata to be stored
- store (bool, optional): Whether to store the text in the content column
+ store (bool, optional): Whether to store the value in the content or content_blob column
  """
  from llm import encode
 
- content_hash = self.content_hash(text)
+ content_hash = self.content_hash(value)
  if self.db["embeddings"].count_where(
  "content_hash = ? and collection_id = ?", [content_hash, self.id]
  ):
  return
- embedding = self.model().embed(text)
+ embedding = self.model().embed(value)
  cast(Table, self.db["embeddings"]).insert(
  {
  "collection_id": self.id,
  "id": id,
  "embedding": encode(embedding),
- "content": text if store else None,
+ "content": value if (store and isinstance(value, str)) else None,
+ "content_blob": value if (store and isinstance(value, bytes)) else None,
  "content_hash": content_hash,
  "metadata": json.dumps(metadata) if metadata else None,
  "updated": int(time.time()),
@@ -152,7 +153,7 @@ def embed(
  )
 
  def embed_multi(
- self, entries: Iterable[Tuple[str, str]], store: bool = False
+ self, entries: Iterable[Tuple[str, Union[str, bytes]]], store: bool = False
  ) -> None:
  """
  Embed multiple texts and store them in the collection with given IDs.
@@ -162,20 +163,20 @@ def embed_multi(
  store (bool, optional): Whether to store the text in the content column
  """
  self.embed_multi_with_metadata(
- ((id, text, None) for id, text in entries), store=store
+ ((id, value, None) for id, value in entries), store=store
  )
 
  def embed_multi_with_metadata(
  self,
- entries: Iterable[Tuple[str, str, Optional[Dict[str, Any]]]],
+ entries: Iterable[Tuple[str, Union[str, bytes], Optional[Dict[str, Any]]]],
  store: bool = False,
  ) -> None:
  """
- Embed multiple texts along with metadata and store them in the collection with given IDs.
+ Embed multiple values along with metadata and store them in the collection with given IDs.
 
  Args:
- entries (iterable): Iterable of (id: str, text: str, metadata: None or dict)
- store (bool, optional): Whether to store the text in the content column
+ entries (iterable): Iterable of (id: str, value: str or bytes, metadata: None or dict)
+ store (bool, optional): Whether to store the value in the content or content_blob column
  """
  import llm
 
@@ -215,12 +216,17 @@ def embed_multi_with_metadata(
  "collection_id": collection_id,
  "id": id,
  "embedding": llm.encode(embedding),
- "content": text if store else None,
- "content_hash": self.content_hash(text),
+ "content": value
+ if (store and isinstance(value, str))
+ else None,
+ "content_blob": value
+ if (store and isinstance(value, bytes))
+ else None,
+ "content_hash": self.content_hash(value),
  "metadata": json.dumps(metadata) if metadata else None,
  "updated": int(time.time()),
  }
- for (embedding, (id, text, metadata)) in zip(
+ for (embedding, (id, value, metadata)) in zip(
  embeddings, filtered_batch
  )
  ),
@@ -300,18 +306,18 @@ def similar_by_id(self, id: str, number: int = 10) -> List[Entry]:
  comparison_vector = llm.decode(embedding)
  return self.similar_by_vector(comparison_vector, number, skip_id=id)
 
- def similar(self, text: str, number: int = 10) -> List[Entry]:
+ def similar(self, value: Union[str, bytes], number: int = 10) -> List[Entry]:
  """
- Find similar items in the collection by a given text.
+ Find similar items in the collection by a given value.
 
  Args:
- text (str): Text to search by
+ value (str or bytes): value to search by
  number (int, optional): Number of similar items to return
 
  Returns:
  list: List of Entry objects
  """
- comparison_vector = self.model().embed(text)
+ comparison_vector = self.model().embed(value)
  return self.similar_by_vector(comparison_vector, number)
 
  @classmethod
@@ -334,6 +340,8 @@ def delete(self):
  self.db.execute("delete from collections where id = ?", [self.id])
 
  @staticmethod
- def content_hash(text: str) -> bytes:
+ def content_hash(input: Union[str, bytes]) -> bytes:
  "Hash content for deduplication. Override to change hashing behavior."
- return hashlib.md5(text.encode("utf8")).digest()
+ if isinstance(input, str):
+ input = input.encode("utf8")
+ return hashlib.md5(input).digest()
diff --git a/llm/embeddings_migrations.py b/llm/embeddings_migrations.py
@@ -83,3 +83,11 @@ def random_md5():
  # De-register functions
  db.conn.create_function("temp_md5", 1, None)
  db.conn.create_function("temp_random_md5", 0, None)
+
+
+@embeddings_migrations()
+def m005_add_content_blob(db):
+ db["embeddings"].add_column("content_blob", bytes)
+ db["embeddings"].transform(
+ column_order=("collection_id", "id", "embedding", "content", "content_blob")
+ )