
# ── §7 · Q1_0_g128 weight format: explainer banner + pure-Python demo ──
print(textwrap.dedent("""
╔══════════════════════════════════════════════════════════════╗
║           Bonsai Q1_0_g128 Weight Representation             ║
╠══════════════════════════════════════════════════════════════╣
║  Each weight = 1 bit:  0 → −scale                            ║
║                        1 → +scale                            ║
║  Every 128 weights share one FP16 scale factor.              ║
║                                                              ║
║  Effective bits per weight:                                  ║
║  1 bit (sign) + 16/128 bits (shared scale) = 1.125 bpw       ║
║                                                              ║
║  Memory comparison for Bonsai-1.7B:                          ║
║    FP16:           3.44 GB  (1.0× baseline)                  ║
║    Q1_0_g128:      0.24 GB  (14.2× smaller!)                 ║
║    MLX 1-bit g128: 0.27 GB  (12.8× smaller)                  ║
╚══════════════════════════════════════════════════════════════╝
"""))
print("📐 Python demo of Q1_0_g128 quantization logic:\n")

import random

random.seed(42)  # fixed seed so the demo output is reproducible
GROUP_SIZE = 128  # number of weights that share one FP16 scale

# Simulate one group of FP16 weights drawn from N(0, 0.1).
weights_fp16 = [random.gauss(0, 0.1) for _ in range(GROUP_SIZE)]

# Q1_0_g128 encoding: one absmax scale per group, one sign bit per weight.
scale = max(abs(w) for w in weights_fp16)
quantized = [1 if w >= 0 else 0 for w in weights_fp16]
dequantized = [scale if b == 1 else -scale for b in quantized]

# Mean squared error of the 1-bit round trip against the original weights.
mse = sum((a - b) ** 2 for a, b in zip(weights_fp16, dequantized)) / GROUP_SIZE

print(f"  FP16 weights (first 8): {[f'{w:.4f}' for w in weights_fp16[:8]]}")
print(f"  1-bit repr   (first 8): {quantized[:8]}")
print(f"  Shared scale:           {scale:.4f}")
print(f"  Dequantized  (first 8): {[f'{w:.4f}' for w in dequantized[:8]]}")
print(f"  MSE of reconstruction:  {mse:.6f}")

# Per-group memory: FP16 = 2 bytes/weight; Q1_0 = 1 bit/weight + 2-byte scale.
memory_fp16 = GROUP_SIZE * 2
memory_1bit = GROUP_SIZE / 8 + 2
print(f"\n  Memory: FP16={memory_fp16}B vs Q1_0_g128={memory_1bit:.1f}B "
      f"({memory_fp16/memory_1bit:.1f}× reduction)")
section("8 · Performance Benchmark — Tokens per Second")


def benchmark(prompt, n_tokens=128, n_runs=3, **kw):
    """Time `infer` over several runs and report average tokens/second.

    Args:
        prompt: text prompt forwarded to `infer`.
        n_tokens: tokens generated per run (numerator of the tok/s rate).
        n_runs: number of timed repetitions to average over.
        **kw: extra keyword arguments passed through to `infer`.

    Returns:
        float: mean tokens/second across all runs.
    """
    timings = []
    for i in range(n_runs):
        print(f"  Run {i+1}/{n_runs} …", end=" ", flush=True)
        # infer() returns (output_text, elapsed_seconds); output is discarded.
        _, elapsed = infer(prompt, verbose=False, n_predict=n_tokens, **kw)
        tps = n_tokens / elapsed
        timings.append(tps)
        print(f"{tps:.1f} tok/s")
    avg = sum(timings) / len(timings)
    print(f"\n  ✅ Average: {avg:.1f} tok/s (over {n_runs} runs, {n_tokens} tokens each)")
    return avg
# Measure local throughput, then show it next to published reference numbers.
print("📊 Benchmarking Bonsai-1.7B on your GPU …")
tps = benchmark(
    "Explain the concept of neural network backpropagation step by step.",
    n_tokens=128, n_runs=3,
)

print("\n  Published reference throughputs (from whitepaper):")
print("  ┌──────────────────────┬─────────┬──────────────┐")
print("  │ Platform             │ Backend │ TG128 tok/s  │")
print("  ├──────────────────────┼─────────┼──────────────┤")
print("  │ RTX 4090             │ CUDA    │        674   │")
print("  │ M4 Pro 48 GB         │ Metal   │        250   │")
print(f"  │ Your GPU (measured)  │ CUDA    │    {tps:>7.1f}   │")
print("  └──────────────────────┴─────────┴──────────────┘")
section("9 · Multi-Turn Chat with Context Accumulation")


def chat(user_msg, system="You are a helpful assistant.", history=None, **kw):
    """Run one chat turn against the model, accumulating prior context.

    Builds a ChatML prompt from `history` plus the new user message, shells
    out to llama-cli, appends the assistant reply to the history, and
    returns both.

    Args:
        user_msg: the new user message for this turn.
        system: system prompt placed at the top of the transcript.
        history: list of (role, message) tuples from earlier turns; a fresh
            list is created when None (avoids the mutable-default pitfall).
        **kw: accepted for call-compatibility; not currently used.

    Returns:
        tuple[str, list]: (assistant reply text, updated history).
    """
    if history is None:
        history = []
    history.append(("user", user_msg))
    # Assemble the ChatML-formatted transcript.
    full = f"<|im_start|>system\n{system}<|im_end|>\n"
    for role, msg in history:
        full += f"<|im_start|>{role}\n{msg}<|im_end|>\n"
    full += "<|im_start|>assistant\n"
    # Escape for embedding in a double-quoted shell argument; the -e flag
    # makes llama-cli turn the literal \n sequences back into newlines.
    safe = full.replace('"', '\\"').replace('\n', '\\n')
    cmd = (
        f'{LLAMA_CLI} -m "{MODEL_PATH}"'
        f' -p "{safe}" -e'
        f' -n 200 --temp 0.5 --top-p 0.85 --top-k 20'
        f' -ngl 99 -c 4096 --no-display-prompt'
    )
    result = run(cmd, capture=True, check=False)
    reply = result.stdout.strip()
    history.append(("assistant", reply))
    return reply, history
# Drive a short multi-turn conversation; `history` threads context between turns.
print("🗣 Starting a 3-turn conversation about 1-bit models …\n")
history = []
turns = [
    "What is a 1-bit language model?",
    "What are the main trade-offs compared to 4-bit or 8-bit quantization?",
    "How does Bonsai specifically address those trade-offs?",
]
for i, msg in enumerate(turns, 1):
    print(f"👤 Turn {i}: {msg}")
    reply, history = chat(msg, history=history)
    print(f"🤖 Bonsai: {reply}\n")
    time.sleep(0.5)  # brief pause so turns are readable as they stream by
section("10 · Sampling Parameter Exploration")

creative_prompt = "Write a one-sentence description of a futuristic city powered entirely by 1-bit AI."

# (label, sampler settings) pairs spanning focused → high-entropy decoding.
configs = [
    ("Precise / Focused", dict(temp=0.1, top_k=10, top_p=0.70)),
    ("Balanced (default)", dict(temp=0.5, top_k=20, top_p=0.85)),
    ("Creative / Varied", dict(temp=0.9, top_k=50, top_p=0.95)),
    ("High entropy", dict(temp=1.2, top_k=100, top_p=0.98)),
]
print(f'Prompt: "{creative_prompt}"\n')
for label, params in configs:
    out, _ = infer(creative_prompt, verbose=False, n_predict=80, **params)
    print(f"  [{label}]")
    print(f"    temp={params['temp']}, top_k={params['top_k']}, top_p={params['top_p']}")
    print(f"    → {out[:200]}\n")  # truncate long completions for display

