Removed inference token-limit code (the `_enforce_budget` method and the pre-inference budget check)

2026-01-21 13:41:03 -06:00
parent b29cd6e6f6
commit 65537c4174
1 changed file with 1 addition and 53 deletions
@@ -252,40 +252,6 @@ class OpenAICompatibleInferenceBot(InferenceBot):
                    changed = True
        return changed

-    def _enforce_budget(self, messages):
-        """Normalize and enforce token budget by summarizing only tool-call requests first,
-        then eliding redundant code blocks if still too large. Returns normalized messages.
-        """
-        normalized = self._normalize_messages(messages)
-        limit = self._get_inference_limit()
-        if not limit:
-            return normalized
-        # Reserve space for completion tokens
-        reserve = self.max_tokens if isinstance(self.max_tokens, int) else 1024
-        budget = max(1024, limit - reserve)
-
-        tokens = self._estimate_tokens(normalized)
-        if tokens <= budget:
-            return normalized
-
-        # Step 1: summarize only tool-call request arguments
-        if self._summarize_tool_call_requests_in_messages(normalized):
-            tokens = self._estimate_tokens(normalized)
-            logging.info(f"Applied tool-call request summarization. tokens={tokens}/{budget}")
-            if tokens <= budget:
-                return normalized
-
-        # Step 2: elide redundant code blocks from older assistant messages
-        if self._elide_redundant_code_blocks(normalized):
-            tokens = self._estimate_tokens(normalized)
-            logging.info(f"Elided redundant code blocks. tokens={tokens}/{budget}")
-            if tokens <= budget:
-                return normalized
-
-        # If still over, log and proceed; the API may still reject; caller may choose to abort
-        logging.warning(f"Projected tokens still exceed budget after optimizations: {tokens}/{budget}")
-        return normalized
-
    def get_chat_response(self, messages):
        if not self.client:
            logging.error("OpenAI client not initialized before get_chat_response.")
@@ -308,12 +274,9 @@ class OpenAICompatibleInferenceBot(InferenceBot):
                        func_copy = {k: v for k, v in func.items() if k != "_tags"}
                        cleaned_tools.append(func_copy)

-            # Enforce token budget prior to API call
-            messages_for_api = self._enforce_budget(messages)
-
            response = self.client.chat.completions.create(
                model=self.model, 
-                messages=messages_for_api,
+                messages=messages,
                tools=cleaned_tools,
                tool_choice="auto" if cleaned_tools else None,
                max_tokens=self.max_tokens,
@@ -341,21 +304,6 @@ class OpenAICompatibleInferenceBot(InferenceBot):
        self.conversation_history[user_id].append({"role": "user", "content": user_message})
        messages = list(self.conversation_history[user_id])
        
-        # Pre-inference token limit check with budgeted optimizations
-        limit = self._get_inference_limit()
-        if limit is not None:
-            # Estimate on normalized messages after applying request-only summarization if needed
-            provisional = self._enforce_budget(messages)
-            token_count = self._estimate_tokens(provisional)
-            reserve = self.max_tokens if isinstance(self.max_tokens, int) else 1024
-            budget = max(1024, limit - reserve)
-            if token_count > budget:
-                logging.warning(f"Request for user {user_id} exceeds inference token budget even after optimizations ({token_count}/{budget}).")
-                # Do not persist this message in history as it was not processed by LLM
-                if self.conversation_history[user_id] and self.conversation_history[user_id][-1]["role"] == "user" and self.conversation_history[user_id][-1]["content"] == user_message:
-                    self.conversation_history[user_id].pop()
-                return "Request exceeds inference token limit after optimization. Please shorten your request, use /clear, or implement RAG in your application."
-
        response = self.get_chat_response(messages)
        
        if not (response.choices and response.choices[0].message):