Skip to content

Commit

Permalink
refactor!: rename all remaining metadata to meta (#6650)
Browse files Browse the repository at this point in the history
* change metadata to meta

* release note
  • Loading branch information
anakin87 committed Dec 28, 2023
1 parent c254e5e commit c773c30
Show file tree
Hide file tree
Showing 15 changed files with 95 additions and 102 deletions.
2 changes: 1 addition & 1 deletion examples/retrievers/in_memory_bm25_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
rag_pipeline.connect("retriever", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder", "llm")
rag_pipeline.connect("llm.replies", "answer_builder.replies")
rag_pipeline.connect("llm.metadata", "answer_builder.metadata")
rag_pipeline.connect("llm.meta", "answer_builder.meta")
rag_pipeline.connect("retriever", "answer_builder.documents")

# Draw the pipeline
Expand Down
4 changes: 2 additions & 2 deletions haystack/components/builders/dynamic_chat_prompt_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class DynamicChatPromptBuilder:
>> {'llm': {'replies': [ChatMessage(content="Berlin is the capital city of Germany and one of the most vibrant
and diverse cities in Europe. Here are some key things to know...Enjoy your time exploring the vibrant and dynamic
capital of Germany!", role=<ChatRole.ASSISTANT: 'assistant'>, name=None, metadata={'model': 'gpt-3.5-turbo-0613',
capital of Germany!", role=<ChatRole.ASSISTANT: 'assistant'>, name=None, meta={'model': 'gpt-3.5-turbo-0613',
'index': 0, 'finish_reason': 'stop', 'usage': {'prompt_tokens': 27, 'completion_tokens': 681, 'total_tokens':
708}})]}}
Expand All @@ -63,7 +63,7 @@ class DynamicChatPromptBuilder:
print(res)
>> {'llm': {'replies': [ChatMessage(content="Here is the weather forecast for Berlin in the next 5
days:\n\nDay 1: Mostly cloudy with a high of 22°C (72°F) and...so it's always a good idea to check for updates
closer to your visit.", role=<ChatRole.ASSISTANT: 'assistant'>, name=None, metadata={'model': 'gpt-3.5-turbo-0613',
closer to your visit.", role=<ChatRole.ASSISTANT: 'assistant'>, name=None, meta={'model': 'gpt-3.5-turbo-0613',
'index': 0, 'finish_reason': 'stop', 'usage': {'prompt_tokens': 37, 'completion_tokens': 201,
'total_tokens': 238}})]}}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def __init__(
suffix: str = "",
batch_size: int = 32,
progress_bar: bool = True,
metadata_fields_to_embed: Optional[List[str]] = None,
meta_fields_to_embed: Optional[List[str]] = None,
embedding_separator: str = "\n",
):
"""
Expand All @@ -105,7 +105,7 @@ def __init__(
:param batch_size: Number of Documents to encode at once.
:param progress_bar: Whether to show a progress bar or not. Can be helpful to disable in production deployments
to keep the logs clean.
:param metadata_fields_to_embed: List of meta fields that should be embedded along with the Document text.
:param meta_fields_to_embed: List of meta fields that should be embedded along with the Document text.
:param embedding_separator: Separator used to concatenate the meta fields to the Document text.
"""
transformers_import.check()
Expand All @@ -129,7 +129,7 @@ def __init__(
self.suffix = suffix
self.batch_size = batch_size
self.progress_bar = progress_bar
self.metadata_fields_to_embed = metadata_fields_to_embed or []
self.meta_fields_to_embed = meta_fields_to_embed or []
self.embedding_separator = embedding_separator

def to_dict(self) -> Dict[str, Any]:
Expand All @@ -145,7 +145,7 @@ def to_dict(self) -> Dict[str, Any]:
suffix=self.suffix,
batch_size=self.batch_size,
progress_bar=self.progress_bar,
metadata_fields_to_embed=self.metadata_fields_to_embed,
meta_fields_to_embed=self.meta_fields_to_embed,
embedding_separator=self.embedding_separator,
)

Expand All @@ -163,9 +163,7 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]:
texts_to_embed = []
for doc in documents:
meta_values_to_embed = [
str(doc.meta[key])
for key in self.metadata_fields_to_embed
if key in doc.meta and doc.meta[key] is not None
str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key] is not None
]

text_to_embed = (
Expand Down
34 changes: 16 additions & 18 deletions haystack/components/embedders/openai_document_embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def __init__(
suffix: str = "",
batch_size: int = 32,
progress_bar: bool = True,
metadata_fields_to_embed: Optional[List[str]] = None,
meta_fields_to_embed: Optional[List[str]] = None,
embedding_separator: str = "\n",
):
"""
Expand All @@ -54,7 +54,7 @@ def __init__(
:param batch_size: Number of Documents to encode at once.
:param progress_bar: Whether to show a progress bar or not. Can be helpful to disable in production deployments
to keep the logs clean.
:param metadata_fields_to_embed: List of meta fields that should be embedded along with the Document text.
:param meta_fields_to_embed: List of meta fields that should be embedded along with the Document text.
:param embedding_separator: Separator used to concatenate the meta fields to the Document text.
"""
self.model_name = model_name
Expand All @@ -64,7 +64,7 @@ def __init__(
self.suffix = suffix
self.batch_size = batch_size
self.progress_bar = progress_bar
self.metadata_fields_to_embed = metadata_fields_to_embed or []
self.meta_fields_to_embed = meta_fields_to_embed or []
self.embedding_separator = embedding_separator

self.client = OpenAI(api_key=api_key, organization=organization, base_url=api_base_url)
Expand All @@ -89,7 +89,7 @@ def to_dict(self) -> Dict[str, Any]:
suffix=self.suffix,
batch_size=self.batch_size,
progress_bar=self.progress_bar,
metadata_fields_to_embed=self.metadata_fields_to_embed,
meta_fields_to_embed=self.meta_fields_to_embed,
embedding_separator=self.embedding_separator,
)

Expand All @@ -100,9 +100,7 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]:
texts_to_embed = []
for doc in documents:
meta_values_to_embed = [
str(doc.meta[key])
for key in self.metadata_fields_to_embed
if key in doc.meta and doc.meta[key] is not None
str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key] is not None
]

text_to_embed = (
Expand All @@ -121,7 +119,7 @@ def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List
"""

all_embeddings = []
metadata: Dict[str, Any] = {}
meta: Dict[str, Any] = {}
for i in tqdm(
range(0, len(texts_to_embed), batch_size), disable=not self.progress_bar, desc="Calculating embeddings"
):
Expand All @@ -130,17 +128,17 @@ def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List
embeddings = [el.embedding for el in response.data]
all_embeddings.extend(embeddings)

if "model" not in metadata:
metadata["model"] = response.model
if "usage" not in metadata:
metadata["usage"] = dict(response.usage)
if "model" not in meta:
meta["model"] = response.model
if "usage" not in meta:
meta["usage"] = dict(response.usage)
else:
metadata["usage"]["prompt_tokens"] += response.usage.prompt_tokens
metadata["usage"]["total_tokens"] += response.usage.total_tokens
meta["usage"]["prompt_tokens"] += response.usage.prompt_tokens
meta["usage"]["total_tokens"] += response.usage.total_tokens

return all_embeddings, metadata
return all_embeddings, meta

@component.output_types(documents=List[Document], metadata=Dict[str, Any])
@component.output_types(documents=List[Document], meta=Dict[str, Any])
def run(self, documents: List[Document]):
"""
Embed a list of Documents.
Expand All @@ -156,9 +154,9 @@ def run(self, documents: List[Document]):

texts_to_embed = self._prepare_texts_to_embed(documents=documents)

embeddings, metadata = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self.batch_size)
embeddings, meta = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self.batch_size)

for doc, emb in zip(documents, embeddings):
doc.embedding = emb

return {"documents": documents, "metadata": metadata}
return {"documents": documents, "meta": meta}
10 changes: 5 additions & 5 deletions haystack/components/embedders/openai_text_embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ class OpenAITextEmbedder:
print(text_embedder.run(text_to_embed))
# {'embedding': [0.017020374536514282, -0.023255806416273117, ...],
# 'metadata': {'model': 'text-embedding-ada-002-v2',
# 'usage': {'prompt_tokens': 4, 'total_tokens': 4}}}
# 'meta': {'model': 'text-embedding-ada-002-v2',
# 'usage': {'prompt_tokens': 4, 'total_tokens': 4}}}
```
"""

Expand Down Expand Up @@ -71,7 +71,7 @@ def to_dict(self) -> Dict[str, Any]:
self, model_name=self.model_name, organization=self.organization, prefix=self.prefix, suffix=self.suffix
)

@component.output_types(embedding=List[float], metadata=Dict[str, Any])
@component.output_types(embedding=List[float], meta=Dict[str, Any])
def run(self, text: str):
"""Embed a string."""
if not isinstance(text, str):
Expand All @@ -87,6 +87,6 @@ def run(self, text: str):
text_to_embed = text_to_embed.replace("\n", " ")

response = self.client.embeddings.create(model=self.model_name, input=text_to_embed)
metadata = {"model": response.model, "usage": dict(response.usage)}
meta = {"model": response.model, "usage": dict(response.usage)}

return {"embedding": response.data[0].embedding, "metadata": metadata}
return {"embedding": response.data[0].embedding, "meta": meta}
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def __init__(
batch_size: int = 32,
progress_bar: bool = True,
normalize_embeddings: bool = False,
metadata_fields_to_embed: Optional[List[str]] = None,
meta_fields_to_embed: Optional[List[str]] = None,
embedding_separator: str = "\n",
):
"""
Expand All @@ -57,7 +57,7 @@ def __init__(
:param batch_size: Number of strings to encode at once.
:param progress_bar: If true, displays progress bar during embedding.
:param normalize_embeddings: If set to true, returned vectors will have length 1.
:param metadata_fields_to_embed: List of meta fields that should be embedded along with the Document content.
:param meta_fields_to_embed: List of meta fields that should be embedded along with the Document content.
:param embedding_separator: Separator used to concatenate the meta fields to the Document content.
"""

Expand All @@ -70,7 +70,7 @@ def __init__(
self.batch_size = batch_size
self.progress_bar = progress_bar
self.normalize_embeddings = normalize_embeddings
self.metadata_fields_to_embed = metadata_fields_to_embed or []
self.meta_fields_to_embed = meta_fields_to_embed or []
self.embedding_separator = embedding_separator

def _get_telemetry_data(self) -> Dict[str, Any]:
Expand All @@ -93,7 +93,7 @@ def to_dict(self) -> Dict[str, Any]:
batch_size=self.batch_size,
progress_bar=self.progress_bar,
normalize_embeddings=self.normalize_embeddings,
metadata_fields_to_embed=self.metadata_fields_to_embed,
meta_fields_to_embed=self.meta_fields_to_embed,
embedding_separator=self.embedding_separator,
)

Expand Down Expand Up @@ -125,7 +125,7 @@ def run(self, documents: List[Document]):
texts_to_embed = []
for doc in documents:
meta_values_to_embed = [
str(doc.meta[key]) for key in self.metadata_fields_to_embed if key in doc.meta and doc.meta[key]
str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key]
]
text_to_embed = (
self.prefix + self.embedding_separator.join(meta_values_to_embed + [doc.content or ""]) + self.suffix
Expand Down
28 changes: 12 additions & 16 deletions haystack/components/rankers/meta_field.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@ class MetaFieldRanker:
from haystack import Document
from haystack.components.rankers import MetaFieldRanker
ranker = MetaFieldRanker(metadata_field="rating")
ranker = MetaFieldRanker(meta_field="rating")
docs = [
Document(text="Paris", metadata={"rating": 1.3}),
Document(text="Berlin", metadata={"rating": 0.7}),
Document(text="Barcelona", metadata={"rating": 2.1}),
Document(text="Paris", meta={"rating": 1.3}),
Document(text="Berlin", meta={"rating": 0.7}),
Document(text="Barcelona", meta={"rating": 2.1}),
]
output = ranker.run(documents=docs)
Expand All @@ -32,15 +32,15 @@ class MetaFieldRanker:

def __init__(
self,
metadata_field: str,
meta_field: str,
weight: float = 1.0,
top_k: Optional[int] = None,
ranking_mode: Literal["reciprocal_rank_fusion", "linear_score"] = "reciprocal_rank_fusion",
):
"""
Creates an instance of MetaFieldRanker.
:param metadata_field: The name of the metadata field to rank by.
:param meta_field: The name of the metadata field to rank by.
:param weight: In range [0,1].
0 disables ranking by a metadata field.
0.5 content and metadata fields have the same impact for the ranking.
Expand All @@ -51,7 +51,7 @@ def __init__(
Use the 'score' mode only with Retrievers or Rankers that return a score in range [0,1].
"""

self.metadata_field = metadata_field
self.meta_field = meta_field
self.weight = weight
self.top_k = top_k
self.ranking_mode = ranking_mode
Expand Down Expand Up @@ -82,11 +82,7 @@ def to_dict(self) -> Dict[str, Any]:
Serialize object to a dictionary.
"""
return default_to_dict(
self,
metadata_field=self.metadata_field,
weight=self.weight,
top_k=self.top_k,
ranking_mode=self.ranking_mode,
self, meta_field=self.meta_field, weight=self.weight, top_k=self.top_k, ranking_mode=self.ranking_mode
)

@component.output_types(documents=List[Document])
Expand All @@ -109,15 +105,15 @@ def run(self, documents: List[Document], top_k: Optional[int] = None):
raise ValueError(f"top_k must be > 0, but got {top_k}")

try:
sorted_by_metadata = sorted(documents, key=lambda doc: doc.meta[self.metadata_field], reverse=True)
sorted_by_metadata = sorted(documents, key=lambda doc: doc.meta[self.meta_field], reverse=True)
except KeyError:
raise ComponentError(
"""
The parameter <metadata_field> is currently set to '{}' but the Documents {} don't have this metadata key.\n
The parameter <meta_field> is currently set to '{}' but the Documents {} don't have this metadata key.\n
Double-check the names of the metadata fields in your documents \n
and set <metadata_field> to the name of the field that contains the metadata you want to use for ranking.
and set <meta_field> to the name of the field that contains the metadata you want to use for ranking.
""".format(
self.metadata_field, ",".join([doc.id for doc in documents if self.metadata_field not in doc.meta])
self.meta_field, ",".join([doc.id for doc in documents if self.meta_field not in doc.meta])
)
)

Expand Down
2 changes: 1 addition & 1 deletion haystack/pipeline_utils/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def download_files(sources: List[str]) -> List[str]:

all_files = []
for stream in streams["streams"]:
file_suffix = ".html" if stream.metadata["content_type"] == "text/html" else ".pdf"
file_suffix = ".html" if stream.meta["content_type"] == "text/html" else ".pdf"
f = NamedTemporaryFile(delete=False, suffix=file_suffix)
stream.to_file(Path(f.name))
all_files.append(f.name)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
enhancements:
- |
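Rename all remaining `metadata` parameters, attributes, and component output keys to `meta`. This is a breaking change: for example, pipeline connections must now use `llm.meta` instead of `llm.metadata`.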
Rename `metadata_fields_to_embed` to `meta_fields_to_embed` in all Embedders.
Rename `metadata_field` to `meta_field` in `MetaFieldRanker`.
Loading

0 comments on commit c773c30

Please sign in to comment.