"""Flask chat server backed by a local llama.cpp GGUF model.

Serves a single-page chat UI ("/") and a "/chat" JSON endpoint that builds
an adaptive prompt from the conversation history plus the new user message,
then completes it with the local model.
"""

from flask import Flask, jsonify, render_template, request
from llama_cpp import Llama

app = Flask(__name__)

# Path to the local GGUF model weights -- update this path for your setup.
MODEL_PATH = "models/oss_20b_gguf/gpt-oss-20b-Q2_K_L.gguf"

# Load the model once at import time; initialization is expensive.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,
    n_threads=8,  # adjust based on your CPU core count
)

# Keyword groups used to pick a per-request output-token budget.
# Long-form keywords are checked BEFORE code keywords (original precedence).
_LONG_FORM_KEYWORDS = ("story", "letter", "essay")
_CODE_KEYWORDS = ("code", "program", "script", "python", "java", "html", "c++")


def build_prompt(history, user_text):
    """Build the full completion prompt: system rules, prior turns, new message.

    Args:
        history: list of {"user": ..., "assistant": ...} dicts, oldest first.
            Missing keys are tolerated and rendered as empty strings.
        user_text: the new user message to be answered.

    Returns:
        A single prompt string ending with "Assistant:" for the model to complete.
    """
    system_prompt = (
        "You are a helpful and adaptive assistant. Follow these rules strictly:\n"
        "- If the user asks a simple or factual question, give a short, precise answer.\n"
        "- If the user requests a story, essay, or letter, provide a longer, well-structured response.\n"
        "- If the user asks for programming help or code, provide correct, complete, well-formatted code.\n"
        "- Always keep answers clear, neat, and structured; use points when helpful.\n"
        "- Output code inside proper Markdown code blocks with language tags for syntax highlighting.\n"
    )
    # "".join avoids quadratic += concatenation; .get() tolerates partial turns.
    turns = [
        f"User: {turn.get('user', '')}\nAssistant: {turn.get('assistant', '')}\n"
        for turn in history
    ]
    return system_prompt + "\n\n" + "".join(turns) + f"User: {user_text}\nAssistant:"


def _max_tokens_for(message):
    """Return the output-token budget implied by keywords in *message*.

    Case-insensitive substring match; long-form requests get 800 tokens,
    code requests 1000, everything else a short 200-token budget.
    """
    lowered = message.lower()  # hoisted: computed once, not per keyword group
    if any(word in lowered for word in _LONG_FORM_KEYWORDS):
        return 800
    if any(word in lowered for word in _CODE_KEYWORDS):
        return 1000
    return 200


@app.route("/")
def index():
    """Serve the chat UI page."""
    return render_template("index.html")


@app.route("/chat", methods=["POST"])
def chat():
    """Handle one chat turn: build the prompt, run the model, return JSON.

    Expects a JSON body {"message": str, "history": [{"user", "assistant"}, ...]}.
    Returns {"response": str}. Malformed or missing fields degrade to empty
    strings rather than raising.
    """
    # silent=True: a non-JSON body yields None instead of aborting with 400.
    data = request.get_json(silent=True) or {}
    # Bug fix: .get("message") may be None (key absent or JSON null), which
    # previously crashed the .lower() keyword checks with AttributeError.
    user_message = data.get("message") or ""
    history = data.get("history", [])

    prompt = build_prompt(history, user_message)
    max_out = _max_tokens_for(user_message)

    resp = llm(
        prompt,
        max_tokens=max_out,
        temperature=0.7,
        stop=["\nUser:", "\nAssistant:"],
    )
    text = resp["choices"][0]["text"].strip()

    # Fenced code blocks are wrapped with a copy button client-side (JS).
    return jsonify({"response": text})


if __name__ == "__main__":
    # NOTE(review): debug=True enables the interactive Werkzeug debugger,
    # which allows arbitrary code execution -- do not combine it with
    # host="0.0.0.0" outside a trusted development network.
    app.run(host="0.0.0.0", port=5000, debug=True)