diff --git a/openai_compatible_inference_bot.py b/openai_compatible_inference_bot.py index 01d3ca7..b20b740 100644 --- a/openai_compatible_inference_bot.py +++ b/openai_compatible_inference_bot.py @@ -252,40 +252,6 @@ class OpenAICompatibleInferenceBot(InferenceBot): changed = True return changed - def _enforce_budget(self, messages): - """Normalize and enforce token budget by summarizing only tool-call requests first, - then eliding redundant code blocks if still too large. Returns normalized messages. - """ - normalized = self._normalize_messages(messages) - limit = self._get_inference_limit() - if not limit: - return normalized - # Reserve space for completion tokens - reserve = self.max_tokens if isinstance(self.max_tokens, int) else 1024 - budget = max(1024, limit - reserve) - - tokens = self._estimate_tokens(normalized) - if tokens <= budget: - return normalized - - # Step 1: summarize only tool-call request arguments - if self._summarize_tool_call_requests_in_messages(normalized): - tokens = self._estimate_tokens(normalized) - logging.info(f"Applied tool-call request summarization. tokens={tokens}/{budget}") - if tokens <= budget: - return normalized - - # Step 2: elide redundant code blocks from older assistant messages - if self._elide_redundant_code_blocks(normalized): - tokens = self._estimate_tokens(normalized) - logging.info(f"Elided redundant code blocks. tokens={tokens}/{budget}") - if tokens <= budget: - return normalized - - # If still over, log and proceed; the API may still reject; caller may choose to abort - logging.warning(f"Projected tokens still exceed budget after optimizations: {tokens}/{budget}") - return normalized - def get_chat_response(self, messages): if not self.client: logging.error("OpenAI client not initialized before get_chat_response.") @@ -308,12 +274,9 @@ class OpenAICompatibleInferenceBot(InferenceBot): func_copy = {k: v for k, v in func.items() if k != "_tags"} cleaned_tools.append(func_copy) - # Enforce token budget prior to API call - messages_for_api = self._enforce_budget(messages) - response = self.client.chat.completions.create( model=self.model, - messages=messages_for_api, + messages=messages, tools=cleaned_tools, tool_choice="auto" if cleaned_tools else None, max_tokens=self.max_tokens, @@ -341,21 +304,6 @@ class OpenAICompatibleInferenceBot(InferenceBot): self.conversation_history[user_id].append({"role": "user", "content": user_message}) messages = list(self.conversation_history[user_id]) - # Pre-inference token limit check with budgeted optimizations - limit = self._get_inference_limit() - if limit is not None: - # Estimate on normalized messages after applying request-only summarization if needed - provisional = self._enforce_budget(messages) - token_count = self._estimate_tokens(provisional) - reserve = self.max_tokens if isinstance(self.max_tokens, int) else 1024 - budget = max(1024, limit - reserve) - if token_count > budget: - logging.warning(f"Request for user {user_id} exceeds inference token budget even after optimizations ({token_count}/{budget}).") - # Do not persist this message in history as it was not processed by LLM - if self.conversation_history[user_id] and self.conversation_history[user_id][-1]["role"] == "user" and self.conversation_history[user_id][-1]["content"] == user_message: - self.conversation_history[user_id].pop() - return "Request exceeds inference token limit after optimization. Please shorten your request, use /clear, or implement RAG in your application." - response = self.get_chat_response(messages) if not (response.choices and response.choices[0].message):