from flask import Flask, render_template, request, jsonify
from llama_cpp import Llama
import os

app = Flask(__name__)
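
# Local GGUF model served via llama-cpp-python (a Q2_K_L quant of gpt-oss-20b).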
MODEL_PATH = "models/oss_20b_gguf/gpt-oss-20b-Q2_K_L.gguf"
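
# Load the model with GPU offload first; if that fails (e.g. a CPU-only
# llama.cpp build or insufficient VRAM), fall back to a CPU-only load.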
try:
    print("Trying GPU offload...")
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,                # context window size in tokens
        n_threads=os.cpu_count(),  # use every available CPU thread
        n_gpu_layers=40            # number of layers to offload to the GPU
    )
    print("GPU initialized successfully.")
except Exception as e:
    print(f"GPU failed: {e}\nFalling back to CPU.")
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,
        n_threads=os.cpu_count(),
        n_gpu_layers=0             # 0 = pure CPU inference
    )


def build_prompt(history, user_text):
    """Flatten the system prompt and prior turns into one text prompt."""
    system_prompt = (
        "You are a helpful assistant. Follow these rules:\n"
        "- Simple Q: Short, precise.\n"
        "- Story/letter/essay: Longer answer.\n"
        "- Code: Complete, neat, Markdown fenced code with language tag.\n"
        "- Use points when helpful.\n"
    )
    prompt = system_prompt + "\n\n"
    for turn in history:
        prompt += f"User: {turn['user']}\nAssistant: {turn['assistant']}\n"
    prompt += f"User: {user_text}\nAssistant:"
    return prompt


@app.route("/")
def index():
    return render_template("index.html")
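

# The /chat endpoint expects JSON of the form {"message": str, "history": [...]},
# where each history item is {"user": ..., "assistant": ...}. An illustrative
# request against the defaults used in app.run() below:
#
#   curl -X POST http://localhost:5000/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Hello", "history": []}'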
@app.route("/chat", methods=["POST"])
def chat():
    data = request.get_json() or {}
    user_message = data.get("message", "")  # default guards .lower() below
    history = data.get("history", [])

    prompt = build_prompt(history, user_message)
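
    # Crude keyword heuristic: long-form and code requests get a larger
    # output-token budget than short factual questions.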
    if any(word in user_message.lower() for word in ["story", "letter", "essay"]):
        max_out = 800
    elif any(word in user_message.lower() for word in ["code", "program", "script", "python", "java", "html", "c++"]):
        max_out = 1000
    else:
        max_out = 200
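
    # Stop on the turn markers so the model does not continue the dialogue
    # by writing the next "User:" line itself.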
    resp = llm(
        prompt,
        max_tokens=max_out,
        temperature=0.7,
        stop=["\nUser:", "\nAssistant:"]
    )

    return jsonify({"response": resp["choices"][0]["text"].strip()})


if __name__ == "__main__":
    # debug=True enables Flask's auto-reloader and debugger; turn it off
    # before exposing the server beyond the local machine.
    app.run(host="0.0.0.0", port=5000, debug=True)