Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add page_number to metadata in DocumentSplitter #7599

Merged
merged 17 commits into from
Apr 29, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Previous commit
Next commit
fragment tests
  • Loading branch information
CarlosFerLo committed Apr 26, 2024
commit cdb216f6f84ff6f10959784763c4456109ffdf3f
18 changes: 8 additions & 10 deletions test/components/preprocessors/test_document_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,7 @@ def test_copy_metadata(self):
assert doc.meta.items() <= split_doc.meta.items()
assert split_doc.content == "Text."

def test_add_page_number_to_metadata_with_no_overlap(self):
# Check for Word split
def test_add_page_number_to_metadata_with_no_overlap_word_split(self):
splitter = DocumentSplitter(split_by="word", split_length=2)
doc1 = Document(content="This is some text.\f This text is on another page.")
doc2 = Document(content="This content has two.\f\f page brakes.")
Expand All @@ -153,7 +152,7 @@ def test_add_page_number_to_metadata_with_no_overlap(self):
for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p

# Check for Sentence split
def test_add_page_number_to_metadata_with_no_overlap_sentence_split(self):
splitter = DocumentSplitter(split_by="sentence", split_length=1)
doc1 = Document(content="This is some text.\f This text is on another page.")
doc2 = Document(content="This content has two.\f\f page brakes.")
Expand All @@ -163,7 +162,7 @@ def test_add_page_number_to_metadata_with_no_overlap(self):
for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p

# Check for Passage Split
def test_add_page_number_to_metadata_with_no_overlap_passage_split(self):
splitter = DocumentSplitter(split_by="passage", split_length=1)
doc1 = Document(
content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
Expand All @@ -174,7 +173,7 @@ def test_add_page_number_to_metadata_with_no_overlap(self):
for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p

# Check for Page Split
def test_add_page_number_to_metadata_with_no_overlap_page_split(self):
splitter = DocumentSplitter(split_by="page", split_length=1)
doc1 = Document(
content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
Expand All @@ -194,8 +193,7 @@ def test_add_page_number_to_metadata_with_no_overlap(self):
for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p

def test_add_page_number_to_metadata_with_overlap(self):
# Check for Word Split
def test_add_page_number_to_metadata_with_overlap_word_split(self):
splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=1)
doc1 = Document(content="This is some text. And\f this text is on another page.")
doc2 = Document(content="This content has two.\f\f page brakes.")
Expand All @@ -206,7 +204,7 @@ def test_add_page_number_to_metadata_with_overlap(self):
print(doc.content, doc.meta, p)
assert doc.meta["page_number"] == p

# Check for Sentence Split
def test_add_page_number_to_metadata_with_overlap_sentence_split(self):
splitter = DocumentSplitter(split_by="sentence", split_length=2, split_overlap=1)
doc1 = Document(content="This is some text. And this is more text.\f This text is on another page. End.")
doc2 = Document(content="This content has two.\f\f page brakes. More text.")
Expand All @@ -217,7 +215,7 @@ def test_add_page_number_to_metadata_with_overlap(self):
print(doc.content, doc.meta, p)
assert doc.meta["page_number"] == p

# Check for Passage Split
def test_add_page_number_to_metadata_with_overlap_passage_split(self):
splitter = DocumentSplitter(split_by="passage", split_length=2, split_overlap=1)
doc1 = Document(
content="This is a text with some words.\f There is a second sentence.\n\nAnd there is a third sentence.\n\nAnd more passages.\n\n\f And another passage."
Expand All @@ -228,7 +226,7 @@ def test_add_page_number_to_metadata_with_overlap(self):
for doc, p in zip(result["documents"], expected_pages):
assert doc.meta["page_number"] == p

# Check for Page Split
def test_add_page_number_to_metadata_with_overlap_page_split(self):
splitter = DocumentSplitter(split_by="page", split_length=2, split_overlap=1)
doc1 = Document(
content="This is a text with some words. There is a second sentence.\f And there is a third sentence.\f And another passage."
Expand Down