From 58c4d7e4e2a2db4af8a9555d1bd9148eaafdc239 Mon Sep 17 00:00:00 2001 From: Carlos Fernandez Date: Thu, 25 Apr 2024 18:18:03 +0200 Subject: [PATCH 01/14] Add the .history folder to the .gitignore file. --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 063a0e7661..3d5086f12e 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,5 @@ haystack/json-schemas # ruff .ruff_cache + +.history From 9c1c153aa627fc296fea8c6cb069c549056072c2 Mon Sep 17 00:00:00 2001 From: Carlos Fernandez Date: Thu, 25 Apr 2024 18:38:56 +0200 Subject: [PATCH 02/14] Add the implementation for page counting used in the v1.25.x branch. It should work as expected in issue #6705. --- .../preprocessors/document_splitter.py | 34 ++++++++++++++++--- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py index adea7cc3ce..6edfda1157 100644 --- a/haystack/components/preprocessors/document_splitter.py +++ b/haystack/components/preprocessors/document_splitter.py @@ -1,5 +1,5 @@ from copy import deepcopy -from typing import List, Literal +from typing import Dict, List, Literal, Tuple from more_itertools import windowed @@ -70,10 +70,10 @@ def run(self, documents: List[Document]): f"DocumentSplitter only works with text documents but document.content for document ID {doc.id} is None." 
) units = self._split_into_units(doc.content, self.split_by) - text_splits = self._concatenate_units(units, self.split_length, self.split_overlap) + text_splits, splits_pages = self._concatenate_units(units, self.split_length, self.split_overlap) metadata = deepcopy(doc.meta) metadata["source_id"] = doc.id - split_docs += [Document(content=txt, meta=metadata) for txt in text_splits] + split_docs += self._create_docs_from_splits(text_splits=text_splits, splits_pages=splits_pages) return {"documents": split_docs} def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]: @@ -95,15 +95,39 @@ def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "pa units[i] += split_at return units - def _concatenate_units(self, elements: List[str], split_length: int, split_overlap: int) -> List[str]: + def _concatenate_units( + self, elements: List[str], split_length: int, split_overlap: int + ) -> Tuple[List[str], List[int]]: """ Concatenates the elements into parts of split_length units. """ text_splits = [] + splits_pages = [] + cur_page = 1 segments = windowed(elements, n=split_length, step=split_length - split_overlap) for seg in segments: current_units = [unit for unit in seg if unit is not None] txt = "".join(current_units) if len(txt) > 0: text_splits.append(txt) - return text_splits + splits_pages.append(cur_page) + processed_units = current_units[: split_length - split_overlap] + if self.split_by != "page": + num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units) + else: + num_page_breaks = len(processed_units) + cur_page += num_page_breaks + return text_splits, splits_pages + + def _create_docs_from_splits(self, text_splits: List[str], splits_pages: List[int], meta: Dict) -> List[Document]: + """ + Creates Document objects from text splits enriching them with page number and the metadata of the original document. 
+ """ + documents: List[Document] = [] + + for i, txt in enumerate(text_splits): + meta = deepcopy(meta) + doc = Document(content=txt, meta=meta) + doc.meta["page"] = splits_pages[i] + documents.append(doc) + return documents From 6d8818f955ab8ea82199988f3c081bf8fb9cf3f5 Mon Sep 17 00:00:00 2001 From: Carlos Fernandez Date: Thu, 25 Apr 2024 19:00:25 +0200 Subject: [PATCH 03/14] Add tests that reflect the desired behaviour. This behaviour is inferred from the one it had on Haystack 1.x --- .../preprocessors/test_document_splitter.py | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/test/components/preprocessors/test_document_splitter.py b/test/components/preprocessors/test_document_splitter.py index 479f0d50ce..1f8c43c3f2 100644 --- a/test/components/preprocessors/test_document_splitter.py +++ b/test/components/preprocessors/test_document_splitter.py @@ -141,3 +141,100 @@ def test_copy_metadata(self): for doc, split_doc in zip(documents, result["documents"]): assert doc.meta.items() <= split_doc.meta.items() assert split_doc.content == "Text." 
+ + def test_add_page_number_to_metadata_with_no_overlap(self): + # Check for Word split + splitter = DocumentSplitter(split_by="word", split_length=2) + doc1 = Document(content="This is some text.\f This text is on another page.") + doc2 = Document(content="This content has two.\f\f page brakes.") + result = splitter.run(documents=[doc1, doc2]) + + expected_pages = [1, 1, 2, 2, 2, 1, 1, 3] + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p + + # Check for Sentence split + splitter = DocumentSplitter(split_by="sentence", split_length=1) + doc1 = Document(content="This is some text.\f This text is on another page.") + doc2 = Document(content="This content has two.\f\f page brakes.") + result = splitter.run(documents=[doc1, doc2]) + + expected_pages = [1, 1, 1, 1] + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p + + # Check for Passage Split + splitter = DocumentSplitter(split_by="passage", split_length=1) + doc1 = Document( + content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage." + ) + result = splitter.run(documents=[doc1]) + + expected_pages = [1, 2, 2, 2] + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p + + # Check for Page Split + splitter = DocumentSplitter(split_by="page", split_length=1) + doc1 = Document( + content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage." + ) + result = splitter.run(documents=[doc1]) + expected_pages = [1, 2, 3] + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p + + splitter = DocumentSplitter(split_by="page", split_length=2) + doc1 = Document( + content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage." 
+ ) + result = splitter.run(documents=[doc1]) + expected_pages = [1, 3] + + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p + + def test_add_page_number_to_metadata_with_overlap(self): + # Check for Word Split + splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=1) + doc1 = Document(content="This is some text. And\f this text is on another page.") + doc2 = Document(content="This content has two.\f\f page brakes.") + result = splitter.run(documents=[doc1, doc2]) + + expected_pages = [1, 1, 1, 2, 2, 1, 1, 3] + for doc, p in zip(result["documents"], expected_pages): + print(doc.content, doc.meta, p) + assert doc.meta["page_number"] == p + + # Check for Sentence Split + splitter = DocumentSplitter(split_by="sentence", split_length=2, split_overlap=1) + doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.") + doc2 = Document(content="This content has two.\f\f page brakes. More text.") + result = splitter.run(documents=[doc1, doc2]) + + expected_pages = [1, 1, 1, 2, 1, 1] + for doc, p in zip(result["documents"], expected_pages): + print(doc.content, doc.meta, p) + assert doc.meta["page_number"] == p + + # Check for Passage Split + splitter = DocumentSplitter(split_by="passage", split_length=2, split_overlap=1) + doc1 = Document( + content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage." + ) + result = splitter.run(documents=[doc1]) + + expected_pages = [1, 2, 2] + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p + + # Check for Page Split + splitter = DocumentSplitter(split_by="page", split_length=2, split_overlap=1) + doc1 = Document( + content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage." 
+ ) + result = splitter.run(documents=[doc1]) + expected_pages = [1, 2, 3] + + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p From 4be3f1608a945c12326131056a91c433177e83b5 Mon Sep 17 00:00:00 2001 From: Carlos Fernandez Date: Thu, 25 Apr 2024 19:00:25 +0200 Subject: [PATCH 04/14] Add tests that reflect the desired behaviour. This behaviour is inferred from the one it had on Haystack 1.x Solve some minor bugs spotted by tests. --- .../preprocessors/document_splitter.py | 6 +- .../preprocessors/test_document_splitter.py | 97 +++++++++++++++++++ 2 files changed, 101 insertions(+), 2 deletions(-) diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py index 6edfda1157..a7ba538059 100644 --- a/haystack/components/preprocessors/document_splitter.py +++ b/haystack/components/preprocessors/document_splitter.py @@ -73,7 +73,9 @@ def run(self, documents: List[Document]): text_splits, splits_pages = self._concatenate_units(units, self.split_length, self.split_overlap) metadata = deepcopy(doc.meta) metadata["source_id"] = doc.id - split_docs += self._create_docs_from_splits(text_splits=text_splits, splits_pages=splits_pages) + split_docs += self._create_docs_from_splits( + text_splits=text_splits, splits_pages=splits_pages, meta=metadata + ) return {"documents": split_docs} def _split_into_units(self, text: str, split_by: Literal["word", "sentence", "passage", "page"]) -> List[str]: @@ -128,6 +130,6 @@ def _create_docs_from_splits(self, text_splits: List[str], splits_pages: List[in for i, txt in enumerate(text_splits): meta = deepcopy(meta) doc = Document(content=txt, meta=meta) - doc.meta["page"] = splits_pages[i] + doc.meta["page_number"] = splits_pages[i] documents.append(doc) return documents diff --git a/test/components/preprocessors/test_document_splitter.py b/test/components/preprocessors/test_document_splitter.py index 479f0d50ce..1f8c43c3f2 100644 --- 
a/test/components/preprocessors/test_document_splitter.py +++ b/test/components/preprocessors/test_document_splitter.py @@ -141,3 +141,100 @@ def test_copy_metadata(self): for doc, split_doc in zip(documents, result["documents"]): assert doc.meta.items() <= split_doc.meta.items() assert split_doc.content == "Text." + + def test_add_page_number_to_metadata_with_no_overlap(self): + # Check for Word split + splitter = DocumentSplitter(split_by="word", split_length=2) + doc1 = Document(content="This is some text.\f This text is on another page.") + doc2 = Document(content="This content has two.\f\f page brakes.") + result = splitter.run(documents=[doc1, doc2]) + + expected_pages = [1, 1, 2, 2, 2, 1, 1, 3] + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p + + # Check for Sentence split + splitter = DocumentSplitter(split_by="sentence", split_length=1) + doc1 = Document(content="This is some text.\f This text is on another page.") + doc2 = Document(content="This content has two.\f\f page brakes.") + result = splitter.run(documents=[doc1, doc2]) + + expected_pages = [1, 1, 1, 1] + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p + + # Check for Passage Split + splitter = DocumentSplitter(split_by="passage", split_length=1) + doc1 = Document( + content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage." + ) + result = splitter.run(documents=[doc1]) + + expected_pages = [1, 2, 2, 2] + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p + + # Check for Page Split + splitter = DocumentSplitter(split_by="page", split_length=1) + doc1 = Document( + content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage." 
+ ) + result = splitter.run(documents=[doc1]) + expected_pages = [1, 2, 3] + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p + + splitter = DocumentSplitter(split_by="page", split_length=2) + doc1 = Document( + content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage." + ) + result = splitter.run(documents=[doc1]) + expected_pages = [1, 3] + + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p + + def test_add_page_number_to_metadata_with_overlap(self): + # Check for Word Split + splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=1) + doc1 = Document(content="This is some text. And\f this text is on another page.") + doc2 = Document(content="This content has two.\f\f page brakes.") + result = splitter.run(documents=[doc1, doc2]) + + expected_pages = [1, 1, 1, 2, 2, 1, 1, 3] + for doc, p in zip(result["documents"], expected_pages): + print(doc.content, doc.meta, p) + assert doc.meta["page_number"] == p + + # Check for Sentence Split + splitter = DocumentSplitter(split_by="sentence", split_length=2, split_overlap=1) + doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.") + doc2 = Document(content="This content has two.\f\f page brakes. More text.") + result = splitter.run(documents=[doc1, doc2]) + + expected_pages = [1, 1, 1, 2, 1, 1] + for doc, p in zip(result["documents"], expected_pages): + print(doc.content, doc.meta, p) + assert doc.meta["page_number"] == p + + # Check for Passage Split + splitter = DocumentSplitter(split_by="passage", split_length=2, split_overlap=1) + doc1 = Document( + content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage." 
+ ) + result = splitter.run(documents=[doc1]) + + expected_pages = [1, 2, 2] + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p + + # Check for Page Split + splitter = DocumentSplitter(split_by="page", split_length=2, split_overlap=1) + doc1 = Document( + content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage." + ) + result = splitter.run(documents=[doc1]) + expected_pages = [1, 2, 3] + + for doc, p in zip(result["documents"], expected_pages): + assert doc.meta["page_number"] == p From 88106f2e7f7cf40e137dabb0ce1afca1fafef0d9 Mon Sep 17 00:00:00 2001 From: Carlos Fernandez Date: Thu, 25 Apr 2024 19:06:00 +0200 Subject: [PATCH 05/14] Update docstrings. --- haystack/components/preprocessors/document_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py index a7ba538059..17edbe9e36 100644 --- a/haystack/components/preprocessors/document_splitter.py +++ b/haystack/components/preprocessors/document_splitter.py @@ -53,7 +53,7 @@ def run(self, documents: List[Document]): :returns: A dictionary with the following key: - `documents`: List of documents with the split texts. A metadata field "source_id" is added to each - document to keep track of the original document that was split. Other metadata are copied from the original + document to keep track of the original document that was split. Another metadata field "page_number" is added to each document to keep track of the page it belonged to in the original document. Other metadata are copied from the original document. :raises TypeError: if the input is not a list of Documents. From d4c627898c8140d2b70c7e0d9e789b2753c2951e Mon Sep 17 00:00:00 2001 From: Carlos Fernandez Date: Thu, 25 Apr 2024 21:50:11 +0200 Subject: [PATCH 06/14] Add reno. 
--- ...e-number-to-document-splitter-162e9dc7443575f0.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 releasenotes/notes/add-page-number-to-document-splitter-162e9dc7443575f0.yaml diff --git a/releasenotes/notes/add-page-number-to-document-splitter-162e9dc7443575f0.yaml b/releasenotes/notes/add-page-number-to-document-splitter-162e9dc7443575f0.yaml new file mode 100644 index 0000000000..98e045b10d --- /dev/null +++ b/releasenotes/notes/add-page-number-to-document-splitter-162e9dc7443575f0.yaml @@ -0,0 +1,10 @@ +--- +highlights: > + Add the "page_number" field to the metadata of all output documents. + +enhancements: + - | + Now the DocumentSplitter adds the "page_number" field to the metadata of all output documents to keep track of the page of the original document it belongs to. +issues: + - | + 6705 From d9b29cc5a392d39d53c8512b4eca9dccdb30a3a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Fern=C3=A1ndez?= <67836662+CarlosFerLo@users.noreply.github.com> Date: Fri, 26 Apr 2024 19:14:32 +0200 Subject: [PATCH 07/14] Update haystack/components/preprocessors/document_splitter.py Update docstring from suggestion Co-authored-by: David S. Batista --- haystack/components/preprocessors/document_splitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py index 17edbe9e36..f1a37101ec 100644 --- a/haystack/components/preprocessors/document_splitter.py +++ b/haystack/components/preprocessors/document_splitter.py @@ -101,7 +101,7 @@ def _concatenate_units( self, elements: List[str], split_length: int, split_overlap: int ) -> Tuple[List[str], List[int]]: """ - Concatenates the elements into parts of split_length units. + Concatenates the elements into parts of split_length units keeping track of the original page number that each element belongs. 
""" text_splits = [] splits_pages = [] From bd8eac29568fbda9d00a52c4d8215da7969780d2 Mon Sep 17 00:00:00 2001 From: Carlos Fernandez Date: Fri, 26 Apr 2024 19:15:39 +0200 Subject: [PATCH 08/14] solve suggestion to improve readability --- haystack/components/preprocessors/document_splitter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py index f1a37101ec..3761b3e6e8 100644 --- a/haystack/components/preprocessors/document_splitter.py +++ b/haystack/components/preprocessors/document_splitter.py @@ -114,10 +114,10 @@ def _concatenate_units( text_splits.append(txt) splits_pages.append(cur_page) processed_units = current_units[: split_length - split_overlap] - if self.split_by != "page": - num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units) - else: + if self.split_by == "page": num_page_breaks = len(processed_units) + else: + num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units) cur_page += num_page_breaks return text_splits, splits_pages From cdb216f6f84ff6f10959784763c4456109ffdf3f Mon Sep 17 00:00:00 2001 From: Carlos Fernandez Date: Fri, 26 Apr 2024 19:55:13 +0200 Subject: [PATCH 09/14] fragment tests --- .../preprocessors/test_document_splitter.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/test/components/preprocessors/test_document_splitter.py b/test/components/preprocessors/test_document_splitter.py index 1f8c43c3f2..4874c25be3 100644 --- a/test/components/preprocessors/test_document_splitter.py +++ b/test/components/preprocessors/test_document_splitter.py @@ -142,8 +142,7 @@ def test_copy_metadata(self): assert doc.meta.items() <= split_doc.meta.items() assert split_doc.content == "Text." 
- def test_add_page_number_to_metadata_with_no_overlap(self): - # Check for Word split + def test_add_page_number_to_metadata_with_no_overlap_word_split(self): splitter = DocumentSplitter(split_by="word", split_length=2) doc1 = Document(content="This is some text.\f This text is on another page.") doc2 = Document(content="This content has two.\f\f page brakes.") @@ -153,7 +152,7 @@ def test_add_page_number_to_metadata_with_no_overlap(self): for doc, p in zip(result["documents"], expected_pages): assert doc.meta["page_number"] == p - # Check for Sentence split + def test_add_page_number_to_metadata_with_no_overlap_sentence_split(self): splitter = DocumentSplitter(split_by="sentence", split_length=1) doc1 = Document(content="This is some text.\f This text is on another page.") doc2 = Document(content="This content has two.\f\f page brakes.") @@ -163,7 +162,7 @@ def test_add_page_number_to_metadata_with_no_overlap(self): for doc, p in zip(result["documents"], expected_pages): assert doc.meta["page_number"] == p - # Check for Passage Split + def test_add_page_number_to_metadata_with_no_overlap_passage_split(self): splitter = DocumentSplitter(split_by="passage", split_length=1) doc1 = Document( content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage." @@ -174,7 +173,7 @@ def test_add_page_number_to_metadata_with_no_overlap(self): for doc, p in zip(result["documents"], expected_pages): assert doc.meta["page_number"] == p - # Check for Page Split + def test_add_page_number_to_metadata_with_no_overlap_page_split(self): splitter = DocumentSplitter(split_by="page", split_length=1) doc1 = Document( content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage." 
@@ -194,8 +193,7 @@ def test_add_page_number_to_metadata_with_no_overlap(self): for doc, p in zip(result["documents"], expected_pages): assert doc.meta["page_number"] == p - def test_add_page_number_to_metadata_with_overlap(self): - # Check for Word Split + def test_add_page_number_to_metadata_with_overlap_word_split(self): splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=1) doc1 = Document(content="This is some text. And\f this text is on another page.") doc2 = Document(content="This content has two.\f\f page brakes.") @@ -206,7 +204,7 @@ def test_add_page_number_to_metadata_with_overlap(self): print(doc.content, doc.meta, p) assert doc.meta["page_number"] == p - # Check for Sentence Split + def test_add_page_number_to_metadata_with_overlap_sentence_split(self): splitter = DocumentSplitter(split_by="sentence", split_length=2, split_overlap=1) doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.") doc2 = Document(content="This content has two.\f\f page brakes. More text.") @@ -217,7 +215,7 @@ def test_add_page_number_to_metadata_with_overlap(self): print(doc.content, doc.meta, p) assert doc.meta["page_number"] == p - # Check for Passage Split + def test_add_page_number_to_metadata_with_overlap_passage_split(self): splitter = DocumentSplitter(split_by="passage", split_length=2, split_overlap=1) doc1 = Document( content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage." @@ -228,7 +226,7 @@ def test_add_page_number_to_metadata_with_overlap(self): for doc, p in zip(result["documents"], expected_pages): assert doc.meta["page_number"] == p - # Check for Page Split + def test_add_page_number_to_metadata_with_overlap_page_split(self): splitter = DocumentSplitter(split_by="page", split_length=2, split_overlap=1) doc1 = Document( content="This is a text with some words. 
There is a second sentence.\f And there is a third sentence.\f And another passage." From 72fe37d032fd17dcfd86493f7821582249fcae75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Fern=C3=A1ndez?= <67836662+CarlosFerLo@users.noreply.github.com> Date: Fri, 26 Apr 2024 19:56:29 +0200 Subject: [PATCH 10/14] Update haystack/components/preprocessors/document_splitter.py Co-authored-by: David S. Batista --- haystack/components/preprocessors/document_splitter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py index 3761b3e6e8..9b1f7fcdf9 100644 --- a/haystack/components/preprocessors/document_splitter.py +++ b/haystack/components/preprocessors/document_splitter.py @@ -121,7 +121,9 @@ def _concatenate_units( cur_page += num_page_breaks return text_splits, splits_pages - def _create_docs_from_splits(self, text_splits: List[str], splits_pages: List[int], meta: Dict) -> List[Document]: + @staticmethod + def _create_docs_from_splits(text_splits: List[str], splits_pages: List[int], meta: Dict) -> List[Document]: + """ Creates Document objects from text splits enriching them with page number and the metadata of the original document. """ From f17acc2bca0f6e52af3c81e494f85f16a199e1e8 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 29 Apr 2024 11:52:35 +0200 Subject: [PATCH 11/14] Update .gitignore --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 3d5086f12e..637e3f3530 100644 --- a/.gitignore +++ b/.gitignore @@ -161,4 +161,3 @@ haystack/json-schemas # ruff .ruff_cache -.history From 7740efbde81ac6c8adcdedf0738ed235e9e5f2b5 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Mon, 29 Apr 2024 11:52:52 +0200 Subject: [PATCH 12/14] Update .gitignore --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 637e3f3530..063a0e7661 100644 --- a/.gitignore +++ b/.gitignore @@ -160,4 +160,3 @@ haystack/json-schemas # ruff .ruff_cache - From 4bf78b3e42ed2cc444e3349059e9ed033e1ecdf9 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Mon, 29 Apr 2024 11:53:10 +0200 Subject: [PATCH 13/14] Update add-page-number-to-document-splitter-162e9dc7443575f0.yaml --- .../add-page-number-to-document-splitter-162e9dc7443575f0.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/releasenotes/notes/add-page-number-to-document-splitter-162e9dc7443575f0.yaml b/releasenotes/notes/add-page-number-to-document-splitter-162e9dc7443575f0.yaml index 98e045b10d..8c97663cf1 100644 --- a/releasenotes/notes/add-page-number-to-document-splitter-162e9dc7443575f0.yaml +++ b/releasenotes/notes/add-page-number-to-document-splitter-162e9dc7443575f0.yaml @@ -5,6 +5,3 @@ highlights: > enhancements: - | Now the DocumentSplitter adds the "page_number" field to the metadata of all output documents to keep track of the page of the original document it belongs to. -issues: - - | - 6705 From 4903a8e2ac5050688b7bb072981fd5843f1c6b59 Mon Sep 17 00:00:00 2001 From: "David S. 
Batista" Date: Mon, 29 Apr 2024 12:18:55 +0200 Subject: [PATCH 14/14] blackening --- haystack/components/preprocessors/document_splitter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/haystack/components/preprocessors/document_splitter.py b/haystack/components/preprocessors/document_splitter.py index 9b1f7fcdf9..033f55a89a 100644 --- a/haystack/components/preprocessors/document_splitter.py +++ b/haystack/components/preprocessors/document_splitter.py @@ -123,7 +123,6 @@ def _concatenate_units( @staticmethod def _create_docs_from_splits(text_splits: List[str], splits_pages: List[int], meta: Dict) -> List[Document]: - """ Creates Document objects from text splits enriching them with page number and the metadata of the original document. """