remiai3 committed
Commit 46e7744 · verified · 1 Parent(s): a0df2a0

Upload 6 files

Files changed (6)
  1. README.md +52 -0
  2. app.py +66 -0
  3. download_model.py +46 -0
  4. inference.py +22 -0
  5. requirements.txt +3 -0
  6. templates/index.html +73 -0
README.md ADDED
@@ -0,0 +1,52 @@
+ # Local GGUF Chat (Q2_K_L) — Run on CPU (16 GB RAM)
+
+ This repository shows how to:
+ 1. Download a single GGUF quantized weight file (`*Q2_K_L.gguf`) from Hugging Face by pasting your token into a file.
+ 2. Run a small local Flask chat UI that talks to the model using `llama-cpp-python`.
+
+ ## Files
+ - `download_model.py` — edit & paste your HF token, then run it to download only the Q2_K_L GGUF file.
+ - `app.py` — Flask server + model loader + chat endpoints.
+ - `templates/index.html` — chat UI (ChatGPT-like).
+ - `requirements.txt` — Python dependencies.
+
+ ## Requirements
+ - Python 3.10.9 (recommended)
+ - ~16 GB RAM (CPU-only); speed depends on quantization & CPU cores.
+
+ ## Quick start
+
+ 1. Create & activate a virtual environment:
+ ```bash
+ python -m venv oss_env
+ # Windows
+ oss_env\Scripts\activate
+ # Linux / macOS
+ source oss_env/bin/activate
+ ```
+
+ 2. Install Python dependencies:
+ `pip install -r requirements.txt`
+
+ 3. Edit `download_model.py`:
+ Paste your Hugging Face token into `HUGGINGFACE_TOKEN`.
+ If your model repo is different, update `REPO_ID`.
+
+ 4. Download the Q2_K_L GGUF:
+ `python download_model.py`
+ The script prints the full path of the downloaded `.gguf` file.
+
+ 5. (Optional) Edit `app.py`:
+ If you want to set the exact `.gguf` path explicitly, set `MODEL_PATH` at the top of `app.py`.
+ Otherwise `app.py` auto-detects the first `.gguf` under `models/`.
+
+ 6. Run the Flask app:
+ `python app.py`
+ Then open http://localhost:5000 in your browser.
+
+ 7. If needed, you can run `inference.py` for a single-shot demo without the chat loop.
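
A note on driving the server without the browser UI: the `/chat` route in `app.py` accepts and returns plain JSON, so any HTTP client works. A minimal client sketch using Python's `requests` package (an assumed extra dependency, not listed in `requirements.txt`):

```python
# Minimal /chat client sketch; `requests` is an assumed extra dependency.
import requests

history = []  # past turns as {"user": ..., "assistant": ...}, matching app.py

r = requests.post(
    "http://localhost:5000/chat",
    json={"message": "Explain GGUF quantization in one line.", "history": history},
)
answer = r.json()["response"]
print(answer)

# Append the finished turn so the next request carries context
history.append({"user": "Explain GGUF quantization in one line.", "assistant": answer})
```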
app.py ADDED
@@ -0,0 +1,66 @@
+ from flask import Flask, render_template, request, jsonify
+ from llama_cpp import Llama
+ import os
+ import glob
+
+ app = Flask(__name__)
+
+ # Path to the local GGUF model weights
+ MODEL_PATH = "models/oss_20b_gguf/gpt-oss-20b-Q2_K_L.gguf"  # update this path
+
+ # Auto-detect the first .gguf under models/ if MODEL_PATH is missing (see README, step 5)
+ if not os.path.exists(MODEL_PATH):
+     candidates = glob.glob("models/**/*.gguf", recursive=True)
+     if not candidates:
+         raise SystemExit("No .gguf file found under models/. Run download_model.py first.")
+     MODEL_PATH = candidates[0]
+
+ # Initialize model
+ llm = Llama(
+     model_path=MODEL_PATH,
+     n_ctx=2048,
+     n_threads=8  # adjust based on your CPU
+ )
+
+ # Build adaptive prompt
+ def build_prompt(history, user_text):
+     system_prompt = (
+         "You are a helpful and adaptive assistant. Follow these rules strictly:\n"
+         "- If the user asks a simple or factual question, give a short, precise answer.\n"
+         "- If the user requests a story, essay, or letter, provide a longer, well-structured response.\n"
+         "- If the user asks for programming help or code, provide correct, complete, well-formatted code.\n"
+         "- Always keep answers clear, neat, and structured; use points when helpful.\n"
+         "- Output code inside proper Markdown code blocks with language tags for syntax highlighting.\n"
+     )
+     prompt = system_prompt + "\n\n"
+     for turn in history:
+         prompt += f"User: {turn['user']}\nAssistant: {turn['assistant']}\n"
+     prompt += f"User: {user_text}\nAssistant:"
+     return prompt
+
+ @app.route("/")
+ def index():
+     return render_template("index.html")
+
+ @app.route("/chat", methods=["POST"])
+ def chat():
+     data = request.get_json()
+     user_message = data.get("message", "")
+     history = data.get("history", [])
+
+     prompt = build_prompt(history, user_message)
+
+     # Adjust max_tokens dynamically
+     if any(word in user_message.lower() for word in ["story", "letter", "essay"]):
+         max_out = 800
+     elif any(word in user_message.lower() for word in ["code", "program", "script", "python", "java", "html", "c++"]):
+         max_out = 1000
+     else:
+         max_out = 200
+
+     resp = llm(
+         prompt,
+         max_tokens=max_out,
+         temperature=0.7,
+         stop=["\nUser:", "\nAssistant:"]
+     )
+
+     text = resp["choices"][0]["text"].strip()
+
+     # Fenced code blocks in the reply get a copy button in the front-end JS
+     return jsonify({"response": text})
+
+ if __name__ == "__main__":
+     app.run(host="0.0.0.0", port=5000, debug=True)
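
For reference, `build_prompt` above flattens the JSON history into a plain-text transcript before each generation; a small sketch of what one call produces (expected output shown in comments):

```python
# Illustration of build_prompt from app.py with a single prior turn.
history = [{"user": "Hi", "assistant": "Hello! How can I help?"}]
print(build_prompt(history, "What is a GGUF file?"))
# -> the system rules, a blank line, then:
# User: Hi
# Assistant: Hello! How can I help?
# User: What is a GGUF file?
# Assistant:
```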
download_model.py ADDED
@@ -0,0 +1,46 @@
+ """
+ download_model.py
+ - Paste your Hugging Face token into HUGGINGFACE_TOKEN below (or set the HUGGINGFACE_TOKEN env var).
+ - By default it downloads from REPO_ID, fetching only files matching PATTERN.
+ - It prints the path of the downloaded .gguf file on success.
+ """
+
+ import os
+ import glob
+ from huggingface_hub import login, snapshot_download
+
+ # ---- EDIT: paste your token here (or set the HUGGINGFACE_TOKEN env var) ----
+ HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", "PASTE_YOUR_TOKEN_HERE")
+ # -----------------------------------------------------------------------------
+
+ # Replace with the repo that contains your GGUF files (change if needed)
+ REPO_ID = "unsloth/gpt-oss-20b-GGUF"
+ LOCAL_DIR = "models/oss_20b_gguf"
+ # Pattern to fetch only the Q2_K_L weight file:
+ PATTERN = "*Q2_K_L.gguf"
+
+ if not HUGGINGFACE_TOKEN or HUGGINGFACE_TOKEN.startswith("PASTE_"):
+     raise SystemExit("Please paste your Hugging Face token into the HUGGINGFACE_TOKEN variable in this file.")
+
+ print("Logging in to Hugging Face hub...")
+ login(token=HUGGINGFACE_TOKEN)
+
+ print(f"Downloading from repo: {REPO_ID} --> local dir: {LOCAL_DIR}")
+ path = snapshot_download(
+     repo_id=REPO_ID,
+     local_dir=LOCAL_DIR,
+     token=HUGGINGFACE_TOKEN,
+     allow_patterns=[PATTERN],
+     resume_download=True,
+ )
+
+ # Find the downloaded .gguf file
+ candidates = glob.glob(os.path.join(LOCAL_DIR, "**", "*.gguf"), recursive=True)
+ candidates = [c for c in candidates if "Q2_K_L" in os.path.basename(c)]
+ if not candidates:
+     raise SystemExit("Download finished but no Q2_K_L .gguf was found in the target folder. Check REPO_ID or PATTERN.")
+ gguf_path = os.path.abspath(candidates[0])
+
+ print("Download complete.")
+ print("GGUF model path:", gguf_path)
+ print("\nSet MODEL_PATH in app.py to this path (or let app.py auto-detect 'models/**/*.gguf').")
inference.py ADDED
@@ -0,0 +1,22 @@
+ from llama_cpp import Llama
+
+ # Path to the downloaded GGUF file (update this to match your folder and file)
+ model_path = "models/oss_20b_gguf/gpt-oss-20b-Q2_K_L.gguf"
+
+ # Load the model
+ llm = Llama(
+     model_path=model_path,
+     n_ctx=2048,   # Context size
+     n_threads=8   # Adjust based on your CPU cores
+ )
+
+ # Generate text
+ prompt = "Who is Iron Man?"
+ output = llm(
+     prompt,
+     max_tokens=200,
+     temperature=0.7,
+     stop=["</s>"]
+ )
+
+ print(output["choices"][0]["text"])
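
As an alternative to raw completion, `llama-cpp-python` also exposes a chat-style API that applies the chat template embedded in the GGUF file, which tends to suit instruction-tuned models better; a minimal sketch reusing the `llm` handle from above:

```python
# Chat-style call via create_chat_completion, reusing `llm` loaded above.
out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Who is Iron Man?"}],
    max_tokens=200,
    temperature=0.7,
)
print(out["choices"][0]["message"]["content"])
```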
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ flask>=2.2
+ llama-cpp-python>=0.1.70
+ huggingface_hub>=0.11.0
templates/index.html ADDED
@@ -0,0 +1,73 @@
+ <!doctype html>
+ <html lang="en">
+ <head>
+   <meta charset="utf-8" />
+   <meta name="viewport" content="width=device-width,initial-scale=1" />
+   <title>Local Chat (GGUF) — Chat</title>
+   <style>
+     body { font-family: Arial, sans-serif; background: #f9f9f9; }
+     .chat-container { width: 60%; margin: 40px auto; background: #fff; border-radius: 8px; padding: 20px; box-shadow: 0 0 10px rgba(0,0,0,0.1); }
+     .messages { max-height: 500px; overflow-y: auto; margin-bottom: 20px; }
+     .user { color: #0066cc; margin: 10px 0; }
+     .assistant { color: #333; margin: 10px 0; white-space: pre-wrap; }
+     textarea { width: 100%; height: 60px; padding: 10px; }
+     button { padding: 10px 20px; margin-top: 10px; cursor: pointer; }
+     pre code { background: #f4f4f4; display: block; padding: 10px; border-radius: 4px; }
+     .copy-btn { background: #0066cc; color: white; padding: 4px 8px; font-size: 12px; border: none; cursor: pointer; float: right; }
+   </style>
+   <script>
+     function escapeHtml(text) {
+       return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
+     }
+
+     async function sendMessage() {
+       const message = document.getElementById("message").value.trim();
+       if (!message) return;
+       // Capture past turns BEFORE rendering the new message, so the
+       // current (unanswered) turn is not duplicated in the prompt.
+       const history = collectHistory();
+       const chatBox = document.getElementById("messages");
+       chatBox.innerHTML += `<div class="user"><strong>You:</strong> ${escapeHtml(message)}</div>`;
+       document.getElementById("message").value = "";
+
+       const response = await fetch("/chat", {
+         method: "POST",
+         headers: { "Content-Type": "application/json" },
+         body: JSON.stringify({ message: message, history: history })
+       });
+       const data = await response.json();
+       let text = data.response;
+
+       // Convert Markdown code blocks to HTML with a copy button
+       text = text.replace(/```(.*?)\n([\s\S]*?)```/g, (match, lang, code) => {
+         const safeCode = escapeHtml(code);
+         return `<div><button class="copy-btn" onclick="copyCode(this)">Copy</button><pre><code class="${lang}">${safeCode}</code></pre></div>`;
+       });
+
+       chatBox.innerHTML += `<div class="assistant"><strong>Assistant:</strong> ${text}</div>`;
+       chatBox.scrollTop = chatBox.scrollHeight;
+     }
+
+     function collectHistory() {
+       const userEls = document.querySelectorAll(".user");
+       const assistantEls = document.querySelectorAll(".assistant");
+       let history = [];
+       for (let i = 0; i < userEls.length; i++) {
+         history.push({
+           user: userEls[i].innerText.replace("You: ", ""),
+           assistant: assistantEls[i]?.innerText.replace("Assistant: ", "") || ""
+         });
+       }
+       return history;
+     }
+
+     function copyCode(button) {
+       const code = button.nextElementSibling.innerText;
+       navigator.clipboard.writeText(code);
+       button.textContent = "Copied!";
+       setTimeout(() => (button.textContent = "Copy"), 2000);
+     }
+   </script>
+ </head>
+ <body>
+   <div class="chat-container">
+     <h2>Local Chat Assistant</h2>
+     <div id="messages" class="messages"></div>
+     <textarea id="message" placeholder="Type your message..."></textarea>
+     <button onclick="sendMessage()">Send</button>
+   </div>
+ </body>
+ </html>