Upload 6 files

- README.md +52 -0
- app.py +66 -0
- download_model.py +46 -0
- inference.py +22 -0
- requirements.txt +3 -0
- templates/index.html +73 -0

README.md (ADDED)

# Local GGUF Chat (Q2_K_L) — Run on CPU (16 GB RAM)

This repository shows how to:

1. Download a single GGUF quantized weight file (`*Q2_K_L.gguf`) from Hugging Face by pasting your token into a file.
2. Run a small local Flask chat UI that talks to the model using `llama-cpp-python`.

## Files

- `download_model.py` — paste in your HF token, then run it to download only the Q2_K_L GGUF file.
- `app.py` — Flask server, model loader, and chat endpoints.
- `templates/index.html` — chat UI (ChatGPT-like).
- `requirements.txt` — Python dependencies.

## Requirements

- Python 3.10.9 (recommended)
- ~16 GB RAM (CPU-only); speed depends on quantization and CPU core count.

## Quick start

1. Create and activate a virtual environment:

   ```bash
   python -m venv oss_env
   # Windows
   oss_env\Scripts\activate
   # Linux / macOS
   source oss_env/bin/activate
   ```

2. Install Python dependencies:

   ```bash
   pip install -r requirements.txt
   ```

3. Edit `download_model.py`: paste your Hugging Face token into `HUGGINGFACE_TOKEN`, and update `REPO_ID` if your model repo is different.

4. Download the Q2_K_L GGUF:

   ```bash
   python download_model.py
   ```

   The script prints the full path to the downloaded `.gguf` file.

5. (Optional) Edit `app.py`: if you want to set the exact `.gguf` path explicitly, set `MODEL_PATH` at the top of `app.py`; otherwise `app.py` will auto-detect the first `.gguf` under `models/` (see the sketch after this list).

6. Run the Flask app:

   ```bash
   python app.py
   ```

   Open http://localhost:5000 in your browser.

7. If needed, you can run `inference.py` for a single-turn demo without the chat loop.

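Note that `app.py` as committed sets `MODEL_PATH` to a fixed path; a minimal auto-detect sketch in the spirit of step 5 (hypothetical, not part of the committed `app.py`) could look like this:

```python
import glob

# Hypothetical fallback: pick the first .gguf found under models/
# when MODEL_PATH has not been set explicitly.
MODEL_PATH = None  # set a path here, or leave None to auto-detect

if MODEL_PATH is None:
    candidates = sorted(glob.glob("models/**/*.gguf", recursive=True))
    if not candidates:
        raise SystemExit("No .gguf file found under models/ — run download_model.py first.")
    MODEL_PATH = candidates[0]
```
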

app.py (ADDED)

```python
from flask import Flask, render_template, request, jsonify
from llama_cpp import Llama

app = Flask(__name__)

# Path to the local GGUF model weights
MODEL_PATH = "models/oss_20b_gguf/gpt-oss-20b-Q2_K_L.gguf"  # update this path

# Initialize the model once at startup
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,
    n_threads=8,  # adjust based on your CPU
)

# Build an adaptive prompt: system rules, then the chat history, then the new turn
def build_prompt(history, user_text):
    system_prompt = (
        "You are a helpful and adaptive assistant. Follow these rules strictly:\n"
        "- If the user asks a simple or factual question, give a short, precise answer.\n"
        "- If the user requests a story, essay, or letter, provide a longer, well-structured response.\n"
        "- If the user asks for programming help or code, provide correct, complete, well-formatted code.\n"
        "- Always keep answers clear, neat, and structured; use points when helpful.\n"
        "- Output code inside proper Markdown code blocks with language tags for syntax highlighting.\n"
    )
    prompt = system_prompt + "\n\n"
    for turn in history:
        prompt += f"User: {turn['user']}\nAssistant: {turn['assistant']}\n"
    prompt += f"User: {user_text}\nAssistant:"
    return prompt

@app.route("/")
def index():
    return render_template("index.html")

@app.route("/chat", methods=["POST"])
def chat():
    data = request.get_json()
    user_message = data.get("message", "")
    history = data.get("history", [])

    prompt = build_prompt(history, user_message)

    # Adjust max_tokens dynamically based on the kind of request
    if any(word in user_message.lower() for word in ["story", "letter", "essay"]):
        max_out = 800
    elif any(word in user_message.lower() for word in ["code", "program", "script", "python", "java", "html", "c++"]):
        max_out = 1000
    else:
        max_out = 200

    resp = llm(
        prompt,
        max_tokens=max_out,
        temperature=0.7,
        stop=["\nUser:", "\nAssistant:"],
    )

    text = resp["choices"][0]["text"].strip()

    # Fenced code blocks get a copy button on the client side (see index.html)
    return jsonify({"response": text})

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000, debug=True)
```

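Once the server is running, the `/chat` endpoint can be exercised without the browser UI. A minimal smoke-test sketch using the `requests` library (not listed in `requirements.txt`, so install it separately):

```python
import requests

# Assumes the server from app.py is running locally on port 5000.
resp = requests.post(
    "http://localhost:5000/chat",
    json={"message": "What is GGUF?", "history": []},
)
print(resp.json()["response"])
```
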

download_model.py (ADDED)

```python
"""
download_model.py
- Paste your Hugging Face token into HUGGINGFACE_TOKEN below (or set the
  HUGGINGFACE_TOKEN environment variable).
- By default it downloads from REPO_ID, fetching only files matching PATTERN.
- On success it prints the path of the downloaded .gguf file.
"""

import os
import glob
from huggingface_hub import login, snapshot_download

# ---- EDIT: paste your token here (or set the HUGGINGFACE_TOKEN env var) ----
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", "PASTE_YOUR_TOKEN_HERE")
# ----------------------------------------------------------------------------

# Replace with the repo that contains your GGUF files (change if needed)
REPO_ID = "unsloth/gpt-oss-20b-GGUF"
LOCAL_DIR = "models/oss_20b_gguf"
# Pattern to fetch only the Q2_K_L weight file:
PATTERN = "*Q2_K_L.gguf"

if not HUGGINGFACE_TOKEN or HUGGINGFACE_TOKEN.startswith("PASTE_"):
    raise SystemExit("Please paste your Hugging Face token into the HUGGINGFACE_TOKEN variable in this file.")

print("Logging in to the Hugging Face hub...")
login(token=HUGGINGFACE_TOKEN)

print(f"Downloading from repo: {REPO_ID} --> local dir: {LOCAL_DIR}")
path = snapshot_download(
    repo_id=REPO_ID,
    local_dir=LOCAL_DIR,
    token=HUGGINGFACE_TOKEN,
    allow_patterns=[PATTERN],
)

# Find the downloaded .gguf file
candidates = glob.glob(os.path.join(LOCAL_DIR, "**", "*.gguf"), recursive=True)
candidates = [c for c in candidates if "Q2_K_L" in os.path.basename(c)]
if not candidates:
    raise SystemExit("Download finished but no Q2_K_L .gguf was found in the target folder. Check REPO_ID or PATTERN.")
gguf_path = os.path.abspath(candidates[0])

print("Download complete.")
print("GGUF model path:", gguf_path)
print("\nSet MODEL_PATH in app.py to this path (or let app.py auto-detect 'models/**/*.gguf').")
```

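Since only one file is needed, `hf_hub_download` is an alternative to `snapshot_download` when the exact filename is known. A hedged sketch; the filename below is an assumption, so check the repo's file list first:

```python
from huggingface_hub import hf_hub_download

# Alternative: fetch one file directly by name.
# "gpt-oss-20b-Q2_K_L.gguf" is an assumed filename; verify it on the repo page.
gguf_path = hf_hub_download(
    repo_id="unsloth/gpt-oss-20b-GGUF",
    filename="gpt-oss-20b-Q2_K_L.gguf",
    local_dir="models/oss_20b_gguf",
)
print(gguf_path)
```
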

inference.py (ADDED)

```python
from llama_cpp import Llama

# Path to the downloaded GGUF file (update this to match your folder and file)
model_path = "models/oss_20b_gguf/gpt-oss-20b-Q2_K_L.gguf"

# Load the model
llm = Llama(
    model_path=model_path,
    n_ctx=2048,   # context size
    n_threads=8,  # adjust based on your CPU cores
)

# Generate text from a single prompt (no chat loop)
prompt = "Who is Iron Man?"
output = llm(
    prompt,
    max_tokens=200,
    temperature=0.7,
    stop=["</s>"],
)

print(output["choices"][0]["text"])
```

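`llama-cpp-python` also exposes a chat-style API that applies the model's own chat template rather than a raw prompt string. A minimal sketch, assuming a recent `llama-cpp-python` and a GGUF that ships with chat template metadata:

```python
from llama_cpp import Llama

llm = Llama(
    model_path="models/oss_20b_gguf/gpt-oss-20b-Q2_K_L.gguf",
    n_ctx=2048,
    n_threads=8,
)

# create_chat_completion() formats messages with the model's chat template,
# which usually behaves better than a hand-rolled "User:/Assistant:" prompt.
out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Who is Iron Man?"}],
    max_tokens=200,
    temperature=0.7,
)
print(out["choices"][0]["message"]["content"])
```
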

requirements.txt (ADDED)

```text
flask>=2.2
llama-cpp-python>=0.1.70
huggingface_hub>=0.11.0
```


templates/index.html (ADDED)

````html
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width,initial-scale=1" />
  <title>Local Chat (GGUF) — Chat</title>
  <style>
    body { font-family: Arial, sans-serif; background: #f9f9f9; }
    .chat-container { width: 60%; margin: 40px auto; background: #fff; border-radius: 8px; padding: 20px; box-shadow: 0 0 10px rgba(0,0,0,0.1); }
    .messages { max-height: 500px; overflow-y: auto; margin-bottom: 20px; }
    .user { color: #0066cc; margin: 10px 0; }
    .assistant { color: #333; margin: 10px 0; white-space: pre-wrap; }
    textarea { width: 100%; height: 60px; padding: 10px; }
    button { padding: 10px 20px; margin-top: 10px; cursor: pointer; }
    pre code { background: #f4f4f4; display: block; padding: 10px; border-radius: 4px; }
    .copy-btn { background: #0066cc; color: white; padding: 4px 8px; font-size: 12px; border: none; cursor: pointer; float: right; }
  </style>
  <script>
    async function sendMessage() {
      const message = document.getElementById("message").value.trim();
      if (!message) return;
      // Collect history before appending the new message so the current
      // turn is not duplicated in the context sent to the server.
      const history = collectHistory();
      const chatBox = document.getElementById("messages");
      chatBox.innerHTML += `<div class="user"><strong>You:</strong> ${message}</div>`;
      document.getElementById("message").value = "";

      const response = await fetch("/chat", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ message: message, history: history })
      });
      const data = await response.json();
      let text = data.response;

      // Convert Markdown code blocks to HTML with a copy button
      text = text.replace(/```(.*?)\n([\s\S]*?)```/g, (match, lang, code) => {
        const safeCode = code.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
        return `<div><button class="copy-btn" onclick="copyCode(this)">Copy</button><pre><code class="${lang}">${safeCode}</code></pre></div>`;
      });

      chatBox.innerHTML += `<div class="assistant"><strong>Assistant:</strong> ${text}</div>`;
      chatBox.scrollTop = chatBox.scrollHeight;
    }

    function collectHistory() {
      const userEls = document.querySelectorAll(".user");
      const assistantEls = document.querySelectorAll(".assistant");
      let history = [];
      for (let i = 0; i < userEls.length; i++) {
        history.push({
          user: userEls[i].innerText.replace("You: ", ""),
          assistant: assistantEls[i]?.innerText.replace("Assistant: ", "") || ""
        });
      }
      return history;
    }

    function copyCode(button) {
      const code = button.nextElementSibling.innerText;
      navigator.clipboard.writeText(code);
      button.textContent = "Copied!";
      setTimeout(() => (button.textContent = "Copy"), 2000);
    }
  </script>
</head>
<body>
  <div class="chat-container">
    <h2>Local Chat Assistant</h2>
    <div id="messages" class="messages"></div>
    <textarea id="message" placeholder="Type your message..."></textarea>
    <button onclick="sendMessage()">Send</button>
  </div>
</body>
</html>
````