removed inference limit code
This commit is contained in:
@@ -252,40 +252,6 @@ class OpenAICompatibleInferenceBot(InferenceBot):
|
|||||||
changed = True
|
changed = True
|
||||||
return changed
|
return changed
|
||||||
|
|
||||||
def _enforce_budget(self, messages):
    """Normalize *messages* and try to bring them under the prompt token budget.

    The budget is the inference limit minus the space reserved for the
    completion, with a floor of 1024 tokens. When the normalized messages
    are over budget, two reductions are attempted in order, re-estimating
    after each one that reports a change:

    1. summarize tool-call request arguments;
    2. elide redundant code blocks.

    Both helpers are called on the normalized list and only their truthy
    "changed" result is inspected here, so they are expected to modify
    that list in place.

    Returns the normalized messages in every case. If they are still over
    budget after both steps, a warning is logged and the caller decides
    whether to proceed.
    """
    prepared = self._normalize_messages(messages)

    ceiling = self._get_inference_limit()
    if not ceiling:
        # No limit configured (None or 0): nothing to enforce.
        return prepared

    # Leave room for the completion; use 1024 when max_tokens is not an int.
    if isinstance(self.max_tokens, int):
        headroom = self.max_tokens
    else:
        headroom = 1024
    budget = max(1024, ceiling - headroom)

    tokens = self._estimate_tokens(prepared)
    if tokens <= budget:
        return prepared

    # First reduction: summarize tool-call request arguments only.
    requests_summarized = self._summarize_tool_call_requests_in_messages(prepared)
    if requests_summarized:
        tokens = self._estimate_tokens(prepared)
        logging.info(f"Applied tool-call request summarization. tokens={tokens}/{budget}")
        if tokens <= budget:
            return prepared

    # Second reduction: drop redundant code blocks.
    blocks_elided = self._elide_redundant_code_blocks(prepared)
    if blocks_elided:
        tokens = self._estimate_tokens(prepared)
        logging.info(f"Elided redundant code blocks. tokens={tokens}/{budget}")
        if tokens <= budget:
            return prepared

    # Still too large: report it and hand the messages back anyway; the API
    # may reject the request and the caller may choose to abort.
    logging.warning(f"Projected tokens still exceed budget after optimizations: {tokens}/{budget}")
    return prepared
|
|
||||||
|
|
||||||
def get_chat_response(self, messages):
|
def get_chat_response(self, messages):
|
||||||
if not self.client:
|
if not self.client:
|
||||||
logging.error("OpenAI client not initialized before get_chat_response.")
|
logging.error("OpenAI client not initialized before get_chat_response.")
|
||||||
@@ -308,12 +274,9 @@ class OpenAICompatibleInferenceBot(InferenceBot):
|
|||||||
func_copy = {k: v for k, v in func.items() if k != "_tags"}
|
func_copy = {k: v for k, v in func.items() if k != "_tags"}
|
||||||
cleaned_tools.append(func_copy)
|
cleaned_tools.append(func_copy)
|
||||||
|
|
||||||
# Enforce token budget prior to API call
|
|
||||||
messages_for_api = self._enforce_budget(messages)
|
|
||||||
|
|
||||||
response = self.client.chat.completions.create(
|
response = self.client.chat.completions.create(
|
||||||
model=self.model,
|
model=self.model,
|
||||||
messages=messages_for_api,
|
messages=messages,
|
||||||
tools=cleaned_tools,
|
tools=cleaned_tools,
|
||||||
tool_choice="auto" if cleaned_tools else None,
|
tool_choice="auto" if cleaned_tools else None,
|
||||||
max_tokens=self.max_tokens,
|
max_tokens=self.max_tokens,
|
||||||
@@ -341,21 +304,6 @@ class OpenAICompatibleInferenceBot(InferenceBot):
|
|||||||
self.conversation_history[user_id].append({"role": "user", "content": user_message})
|
self.conversation_history[user_id].append({"role": "user", "content": user_message})
|
||||||
messages = list(self.conversation_history[user_id])
|
messages = list(self.conversation_history[user_id])
|
||||||
|
|
||||||
# Pre-inference token limit check with budgeted optimizations
|
|
||||||
limit = self._get_inference_limit()
|
|
||||||
if limit is not None:
|
|
||||||
# Estimate on normalized messages after applying request-only summarization if needed
|
|
||||||
provisional = self._enforce_budget(messages)
|
|
||||||
token_count = self._estimate_tokens(provisional)
|
|
||||||
reserve = self.max_tokens if isinstance(self.max_tokens, int) else 1024
|
|
||||||
budget = max(1024, limit - reserve)
|
|
||||||
if token_count > budget:
|
|
||||||
logging.warning(f"Request for user {user_id} exceeds inference token budget even after optimizations ({token_count}/{budget}).")
|
|
||||||
# Do not persist this message in history as it was not processed by LLM
|
|
||||||
if self.conversation_history[user_id] and self.conversation_history[user_id][-1]["role"] == "user" and self.conversation_history[user_id][-1]["content"] == user_message:
|
|
||||||
self.conversation_history[user_id].pop()
|
|
||||||
return "Request exceeds inference token limit after optimization. Please shorten your request, use /clear, or implement RAG in your application."
|
|
||||||
|
|
||||||
response = self.get_chat_response(messages)
|
response = self.get_chat_response(messages)
|
||||||
|
|
||||||
if not (response.choices and response.choices[0].message):
|
if not (response.choices and response.choices[0].message):
|
||||||
|
|||||||
Reference in New Issue
Block a user