remiai3 committed
Commit 46e7744 · verified · 1 Parent(s): a0df2a0

Upload 6 files

Files changed (6)
  1. README.md +52 -0
  2. app.py +66 -0
  3. download_model.py +46 -0
  4. inference.py +22 -0
  5. requirements.txt +3 -0
  6. templates/index.html +73 -0
README.md ADDED
@@ -0,0 +1,52 @@
+ # Local GGUF Chat (Q2_K_L) — Run on CPU (16 GB RAM)
+
+ This repository shows how to:
+ 1. Download a single GGUF quantized weight file (`*Q2_K_L.gguf`) from Hugging Face by pasting your token into a file.
+ 2. Run a small local Flask chat UI that talks to the model using `llama-cpp-python`.
+
+ ## Files
+ - `download_model.py` — edit & paste your HF token, then run it to download only the Q2_K_L GGUF file.
+ - `app.py` — Flask server + model loader + chat endpoints.
+ - `templates/index.html` — chat UI (ChatGPT-like).
+ - `requirements.txt` — Python dependencies.
+
+ ## Requirements
+ - Python 3.10.9 (recommended)
+ - ~16 GB RAM (CPU-only); speed depends on quantization & CPU cores.
+
+ ## Quick start
+
+ 1. Create & activate a virtual environment:
+ ```bash
+ python -m venv oss_env
+ # Windows
+ oss_env\Scripts\activate
+ # Linux / macOS
+ source oss_env/bin/activate
+ ```
+
+ 2. Install Python dependencies:
+ `pip install -r requirements.txt`
+
+ 3. Edit `download_model.py`:
+ Paste your Hugging Face token into `HUGGINGFACE_TOKEN`.
+ If your model repo is different, update `REPO_ID`.
+
+ 4. Download the Q2_K_L GGUF:
+ `python download_model.py`
+ The script prints the full path of the downloaded `.gguf` file.
+
+ 5. (Optional) Edit `app.py`:
+ If you want to set the exact `.gguf` path explicitly, set `MODEL_PATH` at the top of `app.py`.
+ Otherwise `app.py` auto-detects the first `.gguf` under `models/`.
+
+ 6. Run the Flask app:
+ `python app.py`
+ Then open http://localhost:5000 in your browser.
+
+ 7. If needed, you can run `inference.py` for a single-shot demo without the chat loop.
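
A note on driving the server without the browser UI: the `/chat` route in `app.py` accepts and returns plain JSON, so any HTTP client works. A minimal client sketch using Python's `requests` package (an assumed extra dependency, not listed in `requirements.txt`):

```python
# Minimal /chat client sketch; `requests` is an assumed extra dependency.
import requests

history = []  # past turns as {"user": ..., "assistant": ...}, matching app.py

r = requests.post(
    "http://localhost:5000/chat",
    json={"message": "Explain GGUF quantization in one line.", "history": history},
)
answer = r.json()["response"]
print(answer)

# Append the finished turn so the next request carries context
history.append({"user": "Explain GGUF quantization in one line.", "assistant": answer})
```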
app.py ADDED
@@ -0,0 +1,66 @@
+ from flask import Flask, render_template, request, jsonify
+ from llama_cpp import Llama
+ import os
+ import glob
+
+ app = Flask(__name__)
+
+ # Path to the local GGUF model weights
+ MODEL_PATH = "models/oss_20b_gguf/gpt-oss-20b-Q2_K_L.gguf"  # update this path
+
+ # Auto-detect the first .gguf under models/ if MODEL_PATH is missing (see README, step 5)
+ if not os.path.exists(MODEL_PATH):
+     candidates = glob.glob("models/**/*.gguf", recursive=True)
+     if not candidates:
+         raise SystemExit("No .gguf file found under models/. Run download_model.py first.")
+     MODEL_PATH = candidates[0]
+
+ # Initialize model
+ llm = Llama(
+     model_path=MODEL_PATH,
+     n_ctx=2048,
+     n_threads=8  # adjust based on your CPU
+ )
+
+ # Build adaptive prompt
+ def build_prompt(history, user_text):
+     system_prompt = (
+         "You are a helpful and adaptive assistant. Follow these rules strictly:\n"
+         "- If the user asks a simple or factual question, give a short, precise answer.\n"
+         "- If the user requests a story, essay, or letter, provide a longer, well-structured response.\n"
+         "- If the user asks for programming help or code, provide correct, complete, well-formatted code.\n"
+         "- Always keep answers clear, neat, and structured; use points when helpful.\n"
+         "- Output code inside proper Markdown code blocks with language tags for syntax highlighting.\n"
+     )
+     prompt = system_prompt + "\n\n"
+     for turn in history:
+         prompt += f"User: {turn['user']}\nAssistant: {turn['assistant']}\n"
+     prompt += f"User: {user_text}\nAssistant:"
+     return prompt
+
+ @app.route("/")
+ def index():
+     return render_template("index.html")
+
+ @app.route("/chat", methods=["POST"])
+ def chat():
+     data = request.get_json()
+     user_message = data.get("message", "")
+     history = data.get("history", [])
+
+     prompt = build_prompt(history, user_message)
+
+     # Adjust max_tokens dynamically
+     if any(word in user_message.lower() for word in ["story", "letter", "essay"]):
+         max_out = 800
+     elif any(word in user_message.lower() for word in ["code", "program", "script", "python", "java", "html", "c++"]):
+         max_out = 1000
+     else:
+         max_out = 200
+
+     resp = llm(
+         prompt,
+         max_tokens=max_out,
+         temperature=0.7,
+         stop=["\nUser:", "\nAssistant:"]
+     )
+
+     text = resp["choices"][0]["text"].strip()
+
+     # Fenced code blocks in the reply get a copy button in the front-end JS
+     return jsonify({"response": text})
+
+ if __name__ == "__main__":
+     app.run(host="0.0.0.0", port=5000, debug=True)
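
For reference, `build_prompt` above flattens the JSON history into a plain-text transcript before each generation; a small sketch of what one call produces (expected output shown in comments):

```python
# Illustration of build_prompt from app.py with a single prior turn.
history = [{"user": "Hi", "assistant": "Hello! How can I help?"}]
print(build_prompt(history, "What is a GGUF file?"))
# -> the system rules, a blank line, then:
# User: Hi
# Assistant: Hello! How can I help?
# User: What is a GGUF file?
# Assistant:
```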
download_model.py ADDED
@@ -0,0 +1,46 @@
+ """
+ download_model.py
+ - Paste your Hugging Face token into HUGGINGFACE_TOKEN below (or set the HUGGINGFACE_TOKEN env var).
+ - By default it downloads from REPO_ID, fetching only files matching PATTERN.
+ - It prints the path of the downloaded .gguf file on success.
+ """
+
+ import os
+ import glob
+ from huggingface_hub import login, snapshot_download
+
+ # ---- EDIT: paste your token here (or set the HUGGINGFACE_TOKEN env var) ----
+ HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", "PASTE_YOUR_TOKEN_HERE")
+ # -----------------------------------------------------------------------------
+
+ # Replace with the repo that contains your GGUF files (change if needed)
+ REPO_ID = "unsloth/gpt-oss-20b-GGUF"
+ LOCAL_DIR = "models/oss_20b_gguf"
+ # Pattern to fetch only the Q2_K_L weight file:
+ PATTERN = "*Q2_K_L.gguf"
+
+ if not HUGGINGFACE_TOKEN or HUGGINGFACE_TOKEN.startswith("PASTE_"):
+     raise SystemExit("Please paste your Hugging Face token into the HUGGINGFACE_TOKEN variable in this file.")
+
+ print("Logging in to Hugging Face hub...")
+ login(token=HUGGINGFACE_TOKEN)
+
+ print(f"Downloading from repo: {REPO_ID} --> local dir: {LOCAL_DIR}")
+ path = snapshot_download(
+     repo_id=REPO_ID,
+     local_dir=LOCAL_DIR,
+     token=HUGGINGFACE_TOKEN,
+     allow_patterns=[PATTERN],
+     resume_download=True,
+ )
+
+ # Find the downloaded .gguf file
+ candidates = glob.glob(os.path.join(LOCAL_DIR, "**", "*.gguf"), recursive=True)
+ candidates = [c for c in candidates if "Q2_K_L" in os.path.basename(c)]
+ if not candidates:
+     raise SystemExit("Download finished but no Q2_K_L .gguf was found in the target folder. Check REPO_ID or PATTERN.")
+ gguf_path = os.path.abspath(candidates[0])
+
+ print("Download complete.")
+ print("GGUF model path:", gguf_path)
+ print("\nSet MODEL_PATH in app.py to this path (or let app.py auto-detect 'models/**/*.gguf').")
inference.py ADDED
@@ -0,0 +1,22 @@
+ from llama_cpp import Llama
+
+ # Path to the downloaded GGUF file (update this to match your folder and file)
+ model_path = "models/oss_20b_gguf/gpt-oss-20b-Q2_K_L.gguf"
+
+ # Load the model
+ llm = Llama(
+     model_path=model_path,
+     n_ctx=2048,   # Context size
+     n_threads=8   # Adjust based on your CPU cores
+ )
+
+ # Generate text
+ prompt = "Who is Iron Man?"
+ output = llm(
+     prompt,
+     max_tokens=200,
+     temperature=0.7,
+     stop=["</s>"]
+ )
+
+ print(output["choices"][0]["text"])
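
As an alternative to raw completion, `llama-cpp-python` also exposes a chat-style API that applies the chat template embedded in the GGUF file, which tends to suit instruction-tuned models better; a minimal sketch reusing the `llm` handle from above:

```python
# Chat-style call via create_chat_completion, reusing `llm` loaded above.
out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Who is Iron Man?"}],
    max_tokens=200,
    temperature=0.7,
)
print(out["choices"][0]["message"]["content"])
```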
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ flask>=2.2
+ llama-cpp-python>=0.1.70
+ huggingface_hub>=0.11.0
templates/index.html ADDED
@@ -0,0 +1,73 @@
+ <!doctype html>
+ <html lang="en">
+ <head>
+   <meta charset="utf-8" />
+   <meta name="viewport" content="width=device-width,initial-scale=1" />
+   <title>Local Chat (GGUF) — Chat</title>
+   <style>
+     body { font-family: Arial, sans-serif; background: #f9f9f9; }
+     .chat-container { width: 60%; margin: 40px auto; background: #fff; border-radius: 8px; padding: 20px; box-shadow: 0 0 10px rgba(0,0,0,0.1); }
+     .messages { max-height: 500px; overflow-y: auto; margin-bottom: 20px; }
+     .user { color: #0066cc; margin: 10px 0; }
+     .assistant { color: #333; margin: 10px 0; white-space: pre-wrap; }
+     textarea { width: 100%; height: 60px; padding: 10px; }
+     button { padding: 10px 20px; margin-top: 10px; cursor: pointer; }
+     pre code { background: #f4f4f4; display: block; padding: 10px; border-radius: 4px; }
+     .copy-btn { background: #0066cc; color: white; padding: 4px 8px; font-size: 12px; border: none; cursor: pointer; float: right; }
+   </style>
+   <script>
+     function escapeHtml(text) {
+       return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
+     }
+
+     async function sendMessage() {
+       const message = document.getElementById("message").value.trim();
+       if (!message) return;
+       // Capture past turns BEFORE rendering the new message, so the
+       // current (unanswered) turn is not duplicated in the prompt.
+       const history = collectHistory();
+       const chatBox = document.getElementById("messages");
+       chatBox.innerHTML += `<div class="user"><strong>You:</strong> ${escapeHtml(message)}</div>`;
+       document.getElementById("message").value = "";
+
+       const response = await fetch("/chat", {
+         method: "POST",
+         headers: { "Content-Type": "application/json" },
+         body: JSON.stringify({ message: message, history: history })
+       });
+       const data = await response.json();
+       let text = data.response;
+
+       // Convert Markdown code blocks to HTML with a copy button
+       text = text.replace(/```(.*?)\n([\s\S]*?)```/g, (match, lang, code) => {
+         const safeCode = escapeHtml(code);
+         return `<div><button class="copy-btn" onclick="copyCode(this)">Copy</button><pre><code class="${lang}">${safeCode}</code></pre></div>`;
+       });
+
+       chatBox.innerHTML += `<div class="assistant"><strong>Assistant:</strong> ${text}</div>`;
+       chatBox.scrollTop = chatBox.scrollHeight;
+     }
+
+     function collectHistory() {
+       const userEls = document.querySelectorAll(".user");
+       const assistantEls = document.querySelectorAll(".assistant");
+       let history = [];
+       for (let i = 0; i < userEls.length; i++) {
+         history.push({
+           user: userEls[i].innerText.replace("You: ", ""),
+           assistant: assistantEls[i]?.innerText.replace("Assistant: ", "") || ""
+         });
+       }
+       return history;
+     }
+
+     function copyCode(button) {
+       const code = button.nextElementSibling.innerText;
+       navigator.clipboard.writeText(code);
+       button.textContent = "Copied!";
+       setTimeout(() => (button.textContent = "Copy"), 2000);
+     }
+   </script>
+ </head>
+ <body>
+   <div class="chat-container">
+     <h2>Local Chat Assistant</h2>
+     <div id="messages" class="messages"></div>
+     <textarea id="message" placeholder="Type your message..."></textarea>
+     <button onclick="sendMessage()">Send</button>
+   </div>
+ </body>
+ </html>