v0.58 - fixes
FlyingFathead committed Mar 2, 2024
1 parent f1a6b08 commit fad5d29
Showing 4 changed files with 59 additions and 4 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -21,6 +21,7 @@ configparser>=6.0.0
httpx>=0.25.2
langdetect>=1.0.9
matplotlib>=3.8.2
nltk>=3.8.1
openai>=1.6.1
pydub>=0.25.1
python-telegram-bot>=20.7
@@ -51,6 +52,7 @@ timezonefinder>=6.4.0
- Use `configmerger.py` to migrate old configuration files into a newer version's `config.ini`. Copy your existing config into a separate file, e.g. `myconfig.txt`, keeping only the lines you want to carry over to the newer version. Then run `python configmerger.py config.ini myconfig.txt` and your existing config lines will be merged into the new one. This works in most cases, but be careful and double-check the result for migration issues with e.g. `diff`!

# Changelog
- v0.58 - chunking, parsing and other small fixes
- v0.57.5 - changes made to Perplexity API handling; new sonar-online models
- v0.57.1 - improved fallbacks on external API calls like Perplexity API
- v0.57 - improved error catching & failsafe fallbacks
58 changes: 55 additions & 3 deletions api_perplexity_search.py
@@ -3,6 +3,7 @@
# https://github.com/FlyingFathead/TelegramBot-OpenAI-API/
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

import nltk
import re
import openai
import httpx
@@ -20,7 +21,7 @@

# Global variable for chunk size
# Set this value as needed
CHUNK_SIZE = 300
CHUNK_SIZE = 350

# Assuming you've set PERPLEXITY_API_KEY in your environment variables
PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")
@@ -253,7 +254,7 @@ async def translate_response_chunked(bot, user_message, perplexity_response, con
            translated_response += chunk
        else:
            # If the chunk starts with a dash, matches the numbered list pattern, or is a "###" heading, prepend it with a newline
            if chunk.startswith("-") or numbered_list_pattern.match(chunk):
            if chunk.startswith("-") or numbered_list_pattern.match(chunk) or chunk.startswith("###"):
                translated_response += "\n" + chunk
            # Otherwise, join it with a space
            else:
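
This hunk only shows the middle of the joining loop, so here is a minimal, self-contained sketch of the same reassembly idea. The surrounding loop structure, the `join_translated_chunks` helper name, and the exact `numbered_list_pattern` regex are assumptions, since the full function body is collapsed in this diff.

```python
import re

# Assumed pattern for numbered list items such as "1." or "2)" (not shown in this diff)
numbered_list_pattern = re.compile(r"^\d+[\.\)]")

def join_translated_chunks(chunks):
    """Rejoin translated chunks, keeping list items and '###' headings on their own lines."""
    translated_response = ""
    for chunk in chunks:
        if not translated_response:
            translated_response += chunk
        elif chunk.startswith("-") or numbered_list_pattern.match(chunk) or chunk.startswith("###"):
            # List items and Markdown headings start on a new line
            translated_response += "\n" + chunk
        else:
            # Ordinary sentences are joined with a space
            translated_response += " " + chunk
    return translated_response

# Example:
# join_translated_chunks(["Intro text.", "- first item", "- second item", "### Notes"])
```

The `###` check mirrors the change in this commit: Markdown headings, like list items, are forced onto their own line instead of being glued to the previous sentence.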
@@ -270,7 +271,7 @@
def safe_strip(value):
    return value.strip() if value else value

# smart chunking (v1.04)
# smart chunking (v1.09)
def smart_chunk(text, chunk_size=CHUNK_SIZE):
    chunks = []
    start_index = 0
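
The rest of the v1.09 chunker is collapsed in this view, so the sketch below only illustrates the general approach the visible lines suggest: walk the text in windows of `chunk_size` characters and prefer cutting at a newline or sentence end over a hard cut. The helper name `smart_chunk_sketch` and the boundary heuristics are assumptions, not the actual implementation.

```python
def smart_chunk_sketch(text, chunk_size=350):  # 350 mirrors the CHUNK_SIZE default above
    """Split text into chunks of at most chunk_size characters,
    preferring newline or sentence boundaries over hard cuts."""
    chunks = []
    start_index = 0
    while start_index < len(text):
        end_index = min(start_index + chunk_size, len(text))
        if end_index < len(text):
            window = text[start_index:end_index]
            # Prefer the last newline, then the last sentence end, inside the window
            cut = max(window.rfind("\n"), window.rfind(". "))
            if cut > 0:
                end_index = start_index + cut + 1
        chunks.append(text[start_index:end_index].strip())
        start_index = end_index
    return [chunk for chunk in chunks if chunk]
```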
@@ -321,6 +322,57 @@ def markdown_to_html(md_text):
# > archived code below, to be removed ...
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# nltk-tryouts // smart chunking (v1.06)
""" def smart_chunk(text, chunk_size=CHUNK_SIZE):
chunks = []
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Split the text into sentences
sentences = sentence_tokenizer.tokenize(text)
# Iterate through sentences
for sentence in sentences:
# Check if sentence length exceeds chunk size
if len(sentence) > chunk_size:
# Look for the next full stop or list item indicator
next_stop = sentence.find(".", 0)
next_list_item = sentence.find("-", 0) if "-" in sentence else sentence.find("*", 0)
# Split only until the next full stop or list item indicator
split_point = min(next_stop, next_list_item) if next_stop > -1 or next_list_item > -1 else len(sentence)
chunk = sentence[:split_point].strip()
# If there's remaining text, handle it recursively
remaining_text = sentence[split_point:].strip()
if remaining_text:
chunks.extend(smart_chunk(remaining_text, chunk_size))
else:
# If sentence length is smaller than chunk size, add it directly
chunks.append(sentence.strip())
return chunks """

# nltk-tryouts // smart chunking (v1.05)
""" def smart_chunk(text, chunk_size=CHUNK_SIZE):
chunks = []
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') # Replace with your preferred library
# Split the text into sentences
sentences = sentence_tokenizer.tokenize(text)
# Iterate through sentences
for sentence in sentences:
# Check if the sentence length exceeds chunk size
if len(sentence) > chunk_size:
# Split the sentence into smaller chunks
for sub_sentence in nltk.sent_tokenize(sentence, language='english'): # Replace with your preferred library
chunks.append(sub_sentence.strip())
else:
# If sentence length is smaller than chunk size, add it directly
chunks.append(sentence.strip())
return chunks """

# OLD // deprecated // perplexity 70b query
""" async def query_pplx_70b_online(prompt):
    PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY") # Ensure this is securely set
2 changes: 1 addition & 1 deletion main.py
@@ -5,7 +5,7 @@
# https://github.com/FlyingFathead/TelegramBot-OpenAI-API
#
# version of this program
version_number = "0.57.5"
version_number = "0.58"

# experimental modules
import requests
1 change: 1 addition & 0 deletions requirements.txt
@@ -2,6 +2,7 @@ configparser>=6.0.0
httpx>=0.25.2
langdetect>=1.0.9
matplotlib>=3.8.2
nltk>=3.8.1
openai>=1.6.1
pydub>=0.25.1
python-telegram-bot>=20.7
