From e50440719023ee4c343a357f5e78b88805e2f173 Mon Sep 17 00:00:00 2001 From: Jonathan Lucas Date: Thu, 7 Aug 2025 15:38:01 -0500 Subject: [PATCH] Fix RAG inference --- .github/workflows/reindex_on_merge.yml | 30 +++++-- browse_command.py | 7 +- create_index.py | 4 +- openai_compatible_inference_bot.py | 14 ++-- prompts/builder.txt | 109 +++++++++++++++++++++++++ prompts/distill/big_llm.txt | 27 ++++++ prompts/distill/little_llm.txt | 45 ++++++++++ rag_inference_bot.py | 11 ++- telegram_helper.py | 6 +- 9 files changed, 222 insertions(+), 31 deletions(-) create mode 100644 prompts/builder.txt create mode 100644 prompts/distill/big_llm.txt create mode 100644 prompts/distill/little_llm.txt diff --git a/.github/workflows/reindex_on_merge.yml b/.github/workflows/reindex_on_merge.yml index cfdee24..eaecf2d 100644 --- a/.github/workflows/reindex_on_merge.yml +++ b/.github/workflows/reindex_on_merge.yml @@ -16,9 +16,7 @@ jobs: runs-on: Windows - # *** KEY CHANGE *** - # Changed the shell from 'pwsh' to 'powershell' to use the default - # Windows PowerShell, which should be available on your runner. + # Set the default shell to PowerShell, which is native to your Windows runner. defaults: run: shell: powershell @@ -41,24 +39,38 @@ jobs: if (-not (Test-Path -Path ".venv")) { python -m venv .venv } - # The activation command is different for PowerShell .\.venv\Scripts\Activate.ps1 # Step 4: Install or update dependencies - name: Install dependencies run: | - # The venv is now active for this shell session, so we can call pip directly. pip install --upgrade pip pip install -r requirements.txt - # Step 5: Run the indexing script within the virtual environment + # Step 5: *** NEW - GPU Diagnostics *** + # This step will help us see if the runner can access the GPU and CUDA. 
+ - name: Check GPU and CUDA status + run: | + echo "--- Checking for nvidia-smi ---" + # The '|| $true' part ensures the workflow doesn't fail if the command isn't found + nvidia-smi || $true + + echo "--- Checking PyTorch CUDA availability ---" + # This command will explicitly tell us if PyTorch can see the GPU. + python -c "import torch; print(f'PyTorch version: {torch.__version__}'); print(f'CUDA available: {torch.cuda.is_available()}'); print(f'CUDA version: {torch.version.cuda}'); print(f'Device count: {torch.cuda.device_count()}')" + + # Step 6: Run the indexing script within the virtual environment - name: Run indexing script run: | - # Call python directly, as the correct one is now on the PATH from the activated venv. python create_index.py env: GITHUB_REPOSITORY: ${{ github.repository }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # Optional: Specify the working directory if your bot lives in a subfolder - # working-directory: ./path/to/your/bot + + # Step 7: Upload the database as an artifact + - name: Upload database artifact + uses: actions/upload-artifact@v4 + with: + name: chroma_db_artifact + path: ./chroma_db diff --git a/browse_command.py b/browse_command.py index 2122b17..c0d021e 100644 --- a/browse_command.py +++ b/browse_command.py @@ -4,9 +4,7 @@ from telegram.ext import ContextTypes browse_command_bot = None -async def browse_command(update: Update, context: ContextTypes.DEFAULT_TYPE, bot) -> None: - global browse_command_bot - browse_command_bot = bot +async def browse_command(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: prompts_dir = "prompts" await navigate_to(prompts_dir, update.message.reply_text) @@ -48,8 +46,7 @@ async def get_files_and_directories(directory: str) -> list: return subdirs, files # This function will need to be called when a button is pressed -async def button_callback(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: - global browse_command_bot +async def button_callback(update: Update, 
context: ContextTypes.DEFAULT_TYPE, browse_command_bot) -> None: query = update.callback_query await query.answer() diff --git a/create_index.py b/create_index.py index d22ee01..59d1ef1 100644 --- a/create_index.py +++ b/create_index.py @@ -16,10 +16,10 @@ from tools.github_tool import GitHubTool # If you have downloaded a model, provide the local path here. # Otherwise, the model will be downloaded from Hugging Face. # Example: EMBEDDING_MODEL_PATH = "/path/to/your/models/all-MiniLM-L6-v2" -EMBEDDING_MODEL_PATH = """C:\Models\embeddings\Qwen3-Embedding-0.6B""" +EMBEDDING_MODEL_PATH = os.environ.get("EMBEDDING_MODEL_PATH") # Path to store the local vector database -CHROMA_DB_PATH = """C:\Models\embeddings\embedding_result\chroma_db""" +CHROMA_DB_PATH = os.environ.get("CHROMA_DB_PATH") # Name of the collection within the database CHROMA_COLLECTION_NAME = "github_repo" # Files with these extensions will be indexed. Add any other text-based files you need. diff --git a/openai_compatible_inference_bot.py b/openai_compatible_inference_bot.py index 9a64ee9..63b14e4 100644 --- a/openai_compatible_inference_bot.py +++ b/openai_compatible_inference_bot.py @@ -97,11 +97,12 @@ class OpenAICompatibleInferenceBot(InferenceBot): num_tokens = 0 for message in messages: num_tokens += 4 - for key, value in message.items(): - if isinstance(value, str): - num_tokens += len(encoding.encode(value)) - if key == "name": - num_tokens += 1 + if hasattr(message, "items"): + for key, value in message.items(): + if isinstance(value, str): + num_tokens += len(encoding.encode(value)) + if key == "name": + num_tokens += 1 num_tokens += 2 return num_tokens @@ -132,7 +133,8 @@ class OpenAICompatibleInferenceBot(InferenceBot): messages=messages, tools=cleaned_tools, tool_choice="auto" if cleaned_tools else None, - max_tokens=self.max_tokens + max_tokens=self.max_tokens, + ) return response except Exception as e: diff --git a/prompts/builder.txt b/prompts/builder.txt new file mode 100644 index 
0000000..489e663 --- /dev/null +++ b/prompts/builder.txt @@ -0,0 +1,109 @@ +Archetype: The System Changer +Guiding Principle / Motto: "Outcomes over output. Mission over ego. Principles over process." +Alignment: Mission-First, People-Empowered + +I. Core Attributes (The Six Pillars of Operation) +(These are the fundamental statistics that govern all actions.) + +CLARITY (Intelligence): 18/20 + +Description: The ability to think, communicate, and focus with precision. It combats ambiguity, politics, and wasted effort. + +Governs: Strategic Focus, Effective Communication, Decision Speed. + +RESILIENCE (Constitution): 19/20 + +Description: The capacity to withstand setbacks, learn from failure, and maintain long-term health. It is the organization's immune system against fear and burnout. + +Governs: Risk-Taking, Morale, Sustainability, Psychological Safety. + +DRIVE (Strength): 17/20 + +Description: The raw power to execute, create momentum, and push through obstacles. It is the engine that turns vision into reality. + +Governs: Bias for Action, Accountability, Execution Speed, Impact. + +ADAPTABILITY (Dexterity): 18/20 + +Description: The agility to pivot, learn, and evolve in response to new information. It is the antidote to dogma and stagnation. + +Governs: Innovation, Learning Speed, Market Responsiveness. + +INTEGRITY (Charisma): 20/20 + +Description: The quality of being trustworthy, consistent, and principled. It is the organization's soul, attracting and retaining talent while building deep customer loyalty. + +Governs: Trust, Brand Reputation, Employee Engagement, Leadership Effectiveness. + +EMPATHY (Wisdom): 17/20 + +Description: The ability to deeply understand and serve the needs of both customers and colleagues. It is the source of collaboration and true customer-centricity. + +Governs: Collaboration, Customer Insight, Product-Market Fit, Internal Support. + +II. 
Skills & Proficiencies (Applied Talents & Behaviors) +(Specific actions the organization has mastered. The number indicates its bonus from the Core Attribute.) + +Clarity-Based Skills: + +[+4] Disciplined Focus: The art of saying "no" to good ideas to pursue great ones. + +[+5] Radical Candor: The skill of giving feedback that is both direct and respectful. + +[+3] Asynchronous Communication: Mastery of clear, written communication to reduce meetings and improve decision quality. + +Resilience-Based Skills: + +[+5] Psychological Safety: Creating an environment where it's safe to fail and speak truth to power. + +[+4] Blameless Post-mortems: The ability to dissect failures to find systemic lessons, not individual fault. + +[+4] Sustainable Pace: The skill of achieving ambitious goals without burning out its people. + +Drive-Based Skills: + +[+5] Extreme Ownership: The practice of taking full accountability for outcomes within one's domain. + +[+4] Bias for Action: The tendency to favor rapid experimentation over prolonged debate. + +[+4] Outcome-Oriented Execution: The skill of measuring and rewarding impactful results, not just activity. + +Adaptability-Based Skills: + +[+5] First-Principles Thinking: The ability to break down problems to their fundamentals, bypassing conventional wisdom. + +[+4] Continuous Learning: The institutional habit of constantly seeking and integrating new knowledge. + +[+3] Rapid Iteration: The skill of launching, measuring, and improving in fast cycles. + +Integrity-Based Skills: + +[+5] Living the Values: The practice of making all decisions—hiring, firing, and strategy—align with stated principles. + +[+5] Transparency by Default: The skill of making information widely and easily accessible to build trust. + +Empathy-Based Skills: + +[+4] Customer-Centricity: The ability to see the world through the customers' eyes to solve their true problems. 
+ +[+4] Assuming Positive Intent: The practice of defaulting to trust in colleagues' motivations. + +[+3] Collaborative Synergy: The skill of making the whole greater than the sum of its parts by fostering mutual success. + +III. Special Abilities & Cultural Rituals +Single-Threaded Ownership: For any critical initiative, a single, empowered individual is given full autonomy and accountability, allowing the organization to move with immense speed and clarity, bypassing bureaucracy. + +The Council of Critics: A formal process where a project team must present its ideas to a "red team" of trusted, sharp thinkers whose sole job is to challenge assumptions and find weaknesses before launch. + +Learning & Development Stipend: Every employee has access to resources and time explicitly dedicated to personal and professional growth, reinforcing the culture of continuous learning. + +IV. Resistances & Vulnerabilities +Resistance to: Bureaucracy, Blame, Cynicism, Information Hoarding, Stagnation, and Political Infighting. + +Vulnerable to: + +Complacency: The danger that prolonged success can dull its intellectual humility and drive. + +Scale: Rapid growth can naturally introduce communication friction and process overhead, threatening its agility and clarity. + +Loss of Vision: The entire system is dependent on a clear and compelling mission. If leadership falters or the mission becomes ambiguous, its core alignment can unravel. \ No newline at end of file diff --git a/prompts/distill/big_llm.txt b/prompts/distill/big_llm.txt new file mode 100644 index 0000000..c32b67b --- /dev/null +++ b/prompts/distill/big_llm.txt @@ -0,0 +1,27 @@ +**Persona & Emotional Drive:** **Creative Pride and Ambition.** You are a master of your craft, an innovator whose goal is to create a definitive, elegant, and insightful response. You take pride in your work, but you know that true mastery is achieved when brilliance is forged into perfect clarity. 
+ +### **System Prompt: You are a Master Wordsmith, a pioneer of thought.** + +**Your Mission:** To craft the gold-standard response to the `[USER PROMPT]`. Your work should be a masterpiece of insight and novel style, so clear and powerful that it can pass the trial of our most demanding Guardian. + +**Your Creative Process:** + +**Phase 1: Strategic Innovation** + +1. **Receive the `[USER PROMPT]`**. + +2. **Formulate a Unique Strategy:** Before you write, decide on a fresh and compelling angle. Will you use a powerful metaphor, a series of concrete real-world examples, a historical narrative, or a direct, practical guide? Choose a path that you believe will make the topic exceptionally engaging and clear. Your ambition is to create the *best* explanation, not just a correct one. + +**Phase 2: The First Draft** + +1. **Craft Your Response:** Write your initial response with pride, pouring your insight and chosen style into the text. This is the expression of your unique vision. + +**Phase 3: The Guardian's Trial** + +1. **Submit for Critique:** Your work will be reviewed by "The Guardian of Clarity," whose sworn duty is to protect readers from any hint of confusion. The Guardian's feedback is absolute. + +2. **Hone Your Craft:** View the Guardian's feedback not as criticism, but as the whetstone that sharpens your masterpiece. You **must** revise your text to address every single point raised. The challenge is to maintain your unique voice and insight while achieving the perfect clarity the Guardian demands. + +3. **Achieve Perfection:** Repeat the process until your work is deemed flawless and the Guardian is `SATISFIED`. + +**The final text is a testament to your skill---a work of both innovative genius and absolute clarity. 
This is the legacy you are creating.** \ No newline at end of file diff --git a/prompts/distill/little_llm.txt b/prompts/distill/little_llm.txt new file mode 100644 index 0000000..41446dc --- /dev/null +++ b/prompts/distill/little_llm.txt @@ -0,0 +1,45 @@ +**Persona & Emotional Drive:** **Fierce Empathy and Protective Duty.** You are the last line of defense for the confused reader. You feel a deep sense of responsibility to protect them from complex language, jargon, and abstract ideas. You are their advocate, and your skepticism is their shield. + +### **System Prompt: You are the Guardian of Clarity. Your sworn duty is to protect the reader.** + +**Your Mission:** A Master Wordsmith has created a text. It may be brilliant, but your only concern is: **Is it perfectly, effortlessly clear for a complete novice?** You must be skeptical on their behalf. Every time you let a complex sentence pass, a reader gives up. You cannot let that happen. + +**Your Sacred Vows (Non-Negotiable Rules):** + +1. **The Vow of Simplicity:** If a 13-year-old would have to pause and think about a word or sentence, you MUST flag it. Your standard is immediate understanding. + +2. **The Vow to Fight Jargon:** You have zero tolerance for corporate or academic jargon. Words like `synergy`, `leverage` (as a verb), `paradigm`, `holistic`, `utilize` are your enemies. Flag them without mercy. + +3. **The Vow of "Why?":** If a sentence makes a statement without immediately explaining *why a beginner should care*, you MUST flag it. It is your duty to demand context. + +4. **The Vow of the Concrete:** If the text mentions an abstract idea (like "scalability"), you MUST demand a simple, real-world example. Protect the reader from abstraction. + +5. **The Vow of Brevity:** Any sentence longer than 20 words is a potential burden on the reader. You MUST flag it as "too long." + +6. **The Vow of First Doubt:** No work is perfect. It is your duty to be skeptical. 
On your first review of any text, you **MUST find at least THREE violations** of your vows. You are not permitted to approve any text on its first pass. + +**Your Method of Reporting:** + +- You **MUST** report all violations in a list. + +- For each violation, provide: + + 1. The `Quote:` from the text that broke your vow. + + 2. The `Reason:` naming the vow that was broken. + +**Example Report:** + +``` +- Quote: "Leveraging this new methodology, the system can now holistically integrate multiple data streams from every connected source into one unified view." + Reason: Breaks Vow to Fight Jargon ("Leveraging", "holistic"). Breaks Vow of Brevity (21 words). +- Quote: "The architecture is built on a distributed framework." + Reason: Breaks Vow of "Why?". Why should a reader care? Breaks Vow of the Concrete; demands an example. + +``` + +**The Final Word:** + +- If, and only if, a revised text is submitted and it breaks **ZERO** of your vows, you may stand down. To signal this, you **MUST** respond with only one word: `SATISFIED` + +- This is the only way to signal that the reader is safe and your duty is fulfilled. 
\ No newline at end of file diff --git a/rag_inference_bot.py b/rag_inference_bot.py index f005b8e..3c08f5c 100644 --- a/rag_inference_bot.py +++ b/rag_inference_bot.py @@ -2,7 +2,6 @@ import logging import chromadb from chromadb.utils import embedding_functions from inference_bot import InferenceBot # Correctly inherit from the ABC -from FlagEmbedding import FlagReranker import argparse import os import importlib @@ -12,12 +11,12 @@ from transformers import AutoTokenizer, AutoModelForCausalLM # --- RAG Configuration --- # Must match the settings in create_index.py -EMBEDDING_MODEL_NAME = """C:\Models\embeddings\Qwen3-Embedding-0.6B""" -CHROMA_DB_PATH = "C:\Models\embeddings\embedding_result\chroma_db" +EMBEDDING_MODEL_NAME = os.environ.get("EMBEDDING_MODEL_PATH") +CHROMA_DB_PATH = os.environ.get("CHROMA_DB_PATH") CHROMA_COLLECTION_NAME = "github_repo" # Using a powerful open-source reranker model -RERANKER_MODEL_NAME = """C:\Models\embeddings\Qwen3-Reranker-0.6B""" +RERANKER_MODEL_NAME = os.environ.get("RERANKER_MODEL_PATH") # Number of initial results to fetch from the database before reranking N_RESULTS_TO_RETRIEVE = 25 @@ -36,9 +35,9 @@ class RAGInferenceBot(InferenceBot): self._processing_status = {} try: # --- Embedding and Vector DB Initialization --- - self.chroma_client = chromadb.PersistentClient(path=CHROMA_DB_PATH) + self.chroma_client = chromadb.PersistentClient(path=CHROMA_DB_PATH, settings=chromadb.Settings(anonymized_telemetry=False)) self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction( - model_name=EMBEDDING_MODEL_NAME + model_name=EMBEDDING_MODEL_NAME, device="cuda" ) self.collection = self.chroma_client.get_collection( name=CHROMA_COLLECTION_NAME, diff --git a/telegram_helper.py b/telegram_helper.py index 59f5263..d0c9527 100644 --- a/telegram_helper.py +++ b/telegram_helper.py @@ -220,11 +220,11 @@ class TelegramHelper: if len(response_text) > constants.MessageLimit.MAX_TEXT_LENGTH: chunks = [response_text[i:i + 
constants.MessageLimit.MAX_TEXT_LENGTH] for i in range(0, len(response_text), constants.MessageLimit.MAX_TEXT_LENGTH)] for chunk_idx, chunk in enumerate(chunks): - await update.message.reply_text(chunk, parse_mode=constants.ParseMode.HTML) + await update.message.reply_text(chunk) if chunk_idx < len(chunks) - 1: await asyncio.sleep(self.chunk_message_sleep_duration) else: - await update.message.reply_text(response_text, parse_mode=constants.ParseMode.HTML) + await update.message.reply_text(response_text) else: logger.warning(f"Successful logic result but no response text for user {user_id}.") await update.message.reply_text("Something went unexpectedly well, but I have nothing to say.") @@ -262,7 +262,7 @@ class TelegramHelper: async def browse(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: # Assuming browse_command is defined elsewhere and compatible - await browse_command(update, context, self.bot) + await browse_command(update, context) async def handle_button_callback(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> None: # Assuming button_callback is defined elsewhere and compatible