from llama_cpp import Llama

# Path to the downloaded GGUF file (update this to match your folder and file)
model_path = "models/oss_20b_gguf/gpt-oss-20b-Q2_K_L.gguf"

# Load the model
llm = Llama(
    model_path=model_path,
    n_ctx=2048,   # Context window size in tokens
    n_threads=8   # Adjust based on your CPU cores
)

# Generate text
prompt = "Who is Iron Man?"
output = llm(
    prompt,
    max_tokens=200,    # Maximum number of tokens to generate
    temperature=0.7,   # Sampling temperature; lower is more deterministic
    stop=["</s>"]      # Stop generation at the end-of-sequence marker
)

print(output["choices"][0]["text"])
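
# A minimal streaming variant (a sketch, assuming the same `llm` object and
# `prompt` from above). Passing stream=True makes the call return an iterator
# of partial completions, so tokens can be printed as they are generated
# instead of waiting for the full response.
for chunk in llm(
    prompt,
    max_tokens=200,
    temperature=0.7,
    stream=True
):
    print(chunk["choices"][0]["text"], end="", flush=True)
print()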