Skip to content

Commit

Permalink
Fix passing flags to re.split to break org, md content by heading level
Browse files Browse the repository at this point in the history
`re.MULTILINE' should be passed to the `flags' argument, not the
`maxsplit' argument, of the `re.split' function.

This was breaking indexing: passed positionally, re.MULTILINE (integer
value 8) was read as `maxsplit', capping each call at 8 splits. Fixing
this restores search quality to its previous state.
  • Loading branch information
debanjum committed Apr 3, 2024
1 parent f68bfaf commit 7809aff
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 3 deletions.
4 changes: 2 additions & 2 deletions src/khoj/processor/content/markdown/markdown_to_entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def process_single_markdown_file(

# If content is small or content has no children headings, save it as a single entry
if len(TextToEntries.tokenizer(markdown_content_with_ancestry)) <= max_tokens or not re.search(
rf"^#{{{len(ancestry)+1},}}\s", markdown_content, re.MULTILINE
rf"^#{{{len(ancestry)+1},}}\s", markdown_content, flags=re.MULTILINE
):
entry_to_file_map += [(markdown_content_with_ancestry, markdown_file)]
entries.extend([markdown_content_with_ancestry])
Expand All @@ -98,7 +98,7 @@ def process_single_markdown_file(
sections: List[str] = []
while len(sections) < 2:
next_heading_level += 1
sections = re.split(rf"(\n|^)(?=[#]{{{next_heading_level}}} .+\n?)", markdown_content, re.MULTILINE)
sections = re.split(rf"(\n|^)(?=[#]{{{next_heading_level}}} .+\n?)", markdown_content, flags=re.MULTILINE)

for section in sections:
# Skip empty sections
Expand Down
2 changes: 1 addition & 1 deletion src/khoj/processor/content/org_mode/org_to_entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def process_single_org_file(
sections: List[str] = []
while len(sections) < 2:
next_heading_level += 1
sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, re.MULTILINE)
sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, flags=re.MULTILINE)

# Recurse down each non-empty section after parsing its body, heading and ancestry
for section in sections:
Expand Down

0 comments on commit 7809aff

Please sign in to comment.