v0.58 - fixes
FlyingFathead committed Mar 2, 2024
1 parent f1a6b08 commit fad5d29
Showing 4 changed files with 59 additions and 4 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -21,6 +21,7 @@ configparser>=6.0.0
httpx>=0.25.2
langdetect>=1.0.9
matplotlib>=3.8.2
nltk>=3.8.1
openai>=1.6.1
pydub>=0.25.1
python-telegram-bot>=20.7
@@ -51,6 +52,7 @@ timezonefinder>=6.4.0
- Use `configmerger.py` to migrate old configuration files into a newer version's `config.ini`. Copy your existing config into a separate file, e.g. `myconfig.txt`, keeping only the lines you want to carry over to the newer version. Then run `python configmerger.py config.ini myconfig.txt` and your existing config lines will be merged into the new one. This works in most cases, but be careful and double-check the result for migration issues with e.g. `diff`!

# Changelog
- v0.58 - chunking, parsing and other small fixes
- v0.57.5 - changes made to Perplexity API handling; new sonar-online models
- v0.57.1 - improved fallbacks on external API calls like Perplexity API
- v0.57 - improved error catching & failsafe fallbacks
58 changes: 55 additions & 3 deletions api_perplexity_search.py
@@ -3,6 +3,7 @@
# https://github.com/FlyingFathead/TelegramBot-OpenAI-API/
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

import nltk
import re
import openai
import httpx
@@ -20,7 +21,7 @@

# Global variable for chunk size
# Set this value as needed
CHUNK_SIZE = 300
CHUNK_SIZE = 350

# Assuming you've set PERPLEXITY_API_KEY in your environment variables
PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY")
@@ -253,7 +254,7 @@ async def translate_response_chunked(bot, user_message, perplexity_response, con
            translated_response += chunk
        else:
            # If the chunk starts with a dash, matches the numbered list pattern, or is a "###" heading, prepend it with a newline
            if chunk.startswith("-") or numbered_list_pattern.match(chunk):
            if chunk.startswith("-") or numbered_list_pattern.match(chunk) or chunk.startswith("###"):
                translated_response += "\n" + chunk
            # Otherwise, join it with a space
            else:
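
This hunk only shows the middle of the joining loop, so here is a minimal, self-contained sketch of the same reassembly idea. The surrounding loop structure, the `join_translated_chunks` helper name, and the exact `numbered_list_pattern` regex are assumptions, since the full function body is collapsed in this diff.

```python
import re

# Assumed pattern for numbered list items such as "1." or "2)" (not shown in this diff)
numbered_list_pattern = re.compile(r"^\d+[\.\)]")

def join_translated_chunks(chunks):
    """Rejoin translated chunks, keeping list items and '###' headings on their own lines."""
    translated_response = ""
    for chunk in chunks:
        if not translated_response:
            translated_response += chunk
        elif chunk.startswith("-") or numbered_list_pattern.match(chunk) or chunk.startswith("###"):
            # List items and Markdown headings start on a new line
            translated_response += "\n" + chunk
        else:
            # Ordinary sentences are joined with a space
            translated_response += " " + chunk
    return translated_response

# Example:
# join_translated_chunks(["Intro text.", "- first item", "- second item", "### Notes"])
```

The `###` check mirrors the change in this commit: Markdown headings, like list items, are forced onto their own line instead of being glued to the previous sentence.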
@@ -270,7 +271,7 @@
def safe_strip(value):
    return value.strip() if value else value

# smart chunking (v1.04)
# smart chunking (v1.09)
def smart_chunk(text, chunk_size=CHUNK_SIZE):
    chunks = []
    start_index = 0
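
The rest of the v1.09 chunker is collapsed in this view, so the sketch below only illustrates the general approach the visible lines suggest: walk the text in windows of `chunk_size` characters and prefer cutting at a newline or sentence end over a hard cut. The helper name `smart_chunk_sketch` and the boundary heuristics are assumptions, not the actual implementation.

```python
def smart_chunk_sketch(text, chunk_size=350):  # 350 mirrors the CHUNK_SIZE default above
    """Split text into chunks of at most chunk_size characters,
    preferring newline or sentence boundaries over hard cuts."""
    chunks = []
    start_index = 0
    while start_index < len(text):
        end_index = min(start_index + chunk_size, len(text))
        if end_index < len(text):
            window = text[start_index:end_index]
            # Prefer the last newline, then the last sentence end, inside the window
            cut = max(window.rfind("\n"), window.rfind(". "))
            if cut > 0:
                end_index = start_index + cut + 1
        chunks.append(text[start_index:end_index].strip())
        start_index = end_index
    return [chunk for chunk in chunks if chunk]
```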
@@ -321,6 +322,57 @@ def markdown_to_html(md_text):
# > archived code below, to be removed ...
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# nltk-tryouts // smart chunking (v1.06)
""" def smart_chunk(text, chunk_size=CHUNK_SIZE):
chunks = []
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Split the text into sentences
sentences = sentence_tokenizer.tokenize(text)
# Iterate through sentences
for sentence in sentences:
# Check if sentence length exceeds chunk size
if len(sentence) > chunk_size:
# Look for the next full stop or list item indicator
next_stop = sentence.find(".", 0)
next_list_item = sentence.find("-", 0) if "-" in sentence else sentence.find("*", 0)
# Split only until the next full stop or list item indicator
split_point = min(next_stop, next_list_item) if next_stop > -1 or next_list_item > -1 else len(sentence)
chunk = sentence[:split_point].strip()
# If there's remaining text, handle it recursively
remaining_text = sentence[split_point:].strip()
if remaining_text:
chunks.extend(smart_chunk(remaining_text, chunk_size))
else:
# If sentence length is smaller than chunk size, add it directly
chunks.append(sentence.strip())
return chunks """

# nltk-tryouts // smart chunking (v1.05)
""" def smart_chunk(text, chunk_size=CHUNK_SIZE):
chunks = []
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') # Replace with your preferred library
# Split the text into sentences
sentences = sentence_tokenizer.tokenize(text)
# Iterate through sentences
for sentence in sentences:
# Check if the sentence length exceeds chunk size
if len(sentence) > chunk_size:
# Split the sentence into smaller chunks
for sub_sentence in nltk.sent_tokenize(sentence, language='english'): # Replace with your preferred library
chunks.append(sub_sentence.strip())
else:
# If sentence length is smaller than chunk size, add it directly
chunks.append(sentence.strip())
return chunks """

# OLD // deprecated // perplexity 70b query
""" async def query_pplx_70b_online(prompt):
    PERPLEXITY_API_KEY = os.getenv("PERPLEXITY_API_KEY") # Ensure this is securely set
2 changes: 1 addition & 1 deletion main.py
@@ -5,7 +5,7 @@
# https://github.com/FlyingFathead/TelegramBot-OpenAI-API
#
# version of this program
version_number = "0.57.5"
version_number = "0.58"

# experimental modules
import requests
1 change: 1 addition & 0 deletions requirements.txt
@@ -2,6 +2,7 @@ configparser>=6.0.0
httpx>=0.25.2
langdetect>=1.0.9
matplotlib>=3.8.2
nltk>=3.8.1
openai>=1.6.1
pydub>=0.25.1
python-telegram-bot>=20.7
