Fix passing flags to re.split to break org, md content by heading level

`re.MULTILINE' should be passed to the `flags' argument, not the `maxsplit' argument, of the `re.split' function. This was messing up the indexing by only allowing a maximum of re.MULTILINE (i.e. 8) splits. Fixing this restores the search quality to its previous state.
khoj-ai · Apr 3, 2024 · 7809aff · 7809aff
1 parent f68bfaf
commit 7809aff
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 3 deletions.
diff --git a/src/khoj/processor/content/markdown/markdown_to_entries.py b/src/khoj/processor/content/markdown/markdown_to_entries.py
@@ -87,7 +87,7 @@ def process_single_markdown_file(
 
  # If content is small or content has no children headings, save it as a single entry
  if len(TextToEntries.tokenizer(markdown_content_with_ancestry)) <= max_tokens or not re.search(
- rf"^#{{{len(ancestry)+1},}}\s", markdown_content, re.MULTILINE
+ rf"^#{{{len(ancestry)+1},}}\s", markdown_content, flags=re.MULTILINE
  ):
  entry_to_file_map += [(markdown_content_with_ancestry, markdown_file)]
  entries.extend([markdown_content_with_ancestry])
@@ -98,7 +98,7 @@ def process_single_markdown_file(
  sections: List[str] = []
  while len(sections) < 2:
  next_heading_level += 1
- sections = re.split(rf"(\n|^)(?=[#]{{{next_heading_level}}} .+\n?)", markdown_content, re.MULTILINE)
+ sections = re.split(rf"(\n|^)(?=[#]{{{next_heading_level}}} .+\n?)", markdown_content, flags=re.MULTILINE)
 
  for section in sections:
  # Skip empty sections

diff --git a/src/khoj/processor/content/org_mode/org_to_entries.py b/src/khoj/processor/content/org_mode/org_to_entries.py
@@ -114,7 +114,7 @@ def process_single_org_file(
  sections: List[str] = []
  while len(sections) < 2:
  next_heading_level += 1
- sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, re.MULTILINE)
+ sections = re.split(rf"(\n|^)(?=[*]{{{next_heading_level}}} .+\n?)", org_content, flags=re.MULTILINE)
 
  # Recurse down each non-empty section after parsing its body, heading and ancestry
  for section in sections: