from flask import Flask, render_template, request, jsonify
from llama_cpp import Llama
import os

app = Flask(__name__)

# Update this path to point at your downloaded model weights
MODEL_PATH = "models/oss_20b_gguf/gpt-oss-20b-Q2_K_L.gguf"

# Try GPU offload first: this only helps if llama-cpp-python was compiled with
# CUDA/Metal support and layers can actually be offloaded.
# Adjust n_gpu_layers for your GPU memory: roughly 20-40 for mid-range GPUs,
# 60-100 for cards with more VRAM, 0 = CPU only.
try:
    print("Trying GPU offload...")
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,
        n_threads=os.cpu_count(),
        n_gpu_layers=40  # increase or decrease based on your GPU memory
    )
    print("GPU initialized successfully.")
except Exception as e:
    print(f"GPU failed: {e}\nFalling back to CPU.")
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,
        n_threads=os.cpu_count(),
        n_gpu_layers=0  # CPU only
    )


def build_prompt(history, user_text):
    system_prompt = (
        "You are a helpful assistant. Follow these rules:\n"
        "- Simple question: short, precise answer.\n"
        "- Story/letter/essay: longer answer.\n"
        "- Code: complete, neat, Markdown-fenced code with a language tag.\n"
        "- Use bullet points when helpful.\n"
    )
    prompt = system_prompt + "\n\n"
    for turn in history:
        prompt += f"User: {turn['user']}\nAssistant: {turn['assistant']}\n"
    prompt += f"User: {user_text}\nAssistant:"
    return prompt


@app.route("/")
def index():
    return render_template("index.html")


@app.route("/chat", methods=["POST"])
def chat():
    data = request.get_json() or {}
    user_message = data.get("message", "")
    history = data.get("history", [])

    prompt = build_prompt(history, user_message)

    # Rough output-length budget based on the kind of request.
    if any(word in user_message.lower() for word in ["story", "letter", "essay"]):
        max_out = 800
    elif any(word in user_message.lower() for word in ["code", "program", "script", "python", "java", "html", "c++"]):
        max_out = 1000
    else:
        max_out = 200

    resp = llm(
        prompt,
        max_tokens=max_out,
        temperature=0.7,
        stop=["\nUser:", "\nAssistant:"]
    )
    return jsonify({"response": resp["choices"][0]["text"].strip()})


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000, debug=True)
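
# A minimal client-side smoke test for the /chat endpoint, kept here as a
# commented sketch. It assumes the server above is running and reachable at
# http://localhost:5000 and that the `requests` package is installed; neither
# is required by the app itself.
#
#   import requests
#
#   r = requests.post(
#       "http://localhost:5000/chat",
#       json={"message": "Explain list comprehensions in one sentence", "history": []},
#   )
#   print(r.json()["response"])
#
# The "history" field mirrors what build_prompt() expects: a list of
# {"user": ..., "assistant": ...} dicts from earlier turns.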