Upload README.md with huggingface_hub
README.md CHANGED

@@ -116,33 +116,30 @@ We use [`llama-quantize`](./quantize.sh) with `imatrix` to quantize models from
Here's the speed and quality evaluation on two nano benchmarks; higher is better. `IQ3_S` seems to be a good balance between size and speed.

-| Q6_K | 2.36 GiB | 6.56 | 5315MB | 3334 | -2% |
-| Q8_0 | 3.05 GiB | 8.50 | 6027MB | 3767 | +11% |
-| F16 | 5.75 GiB | 16.00 | 9939MB | 3399 | +0% |
+#### Table 1: Tokens per Second on NanoHotpotQA `Documents`
+
+| Quantization | File Size | BPW | Peak VRAM | Token/s w/ FA | Token/s w/o FA |
+|------------------|-----------|-----|-----------|--------------|----------------|
+| IQ1_S | 748.77 MiB | 2.04 | 4137MB | 3625 | 2050 |
+| IQ1_M | 804.97 MiB | 2.19 | 4193MB | 3349 | 1997 |
+| IQ2_XXS | 898.64 MiB | 2.44 | 4287MB | 3701 | 2071 |
+| IQ2_M | 1.06 GiB | 2.94 | 4471MB | 3407 | 1989 |
+| Q2_K | 1.18 GiB | 3.29 | 4599MB | 3173 | 1905 |
+| IQ3_XXS | 1.19 GiB | 3.31 | 4605MB | 3668 | 2067 |
+| IQ3_XS | 1.29 GiB | 3.59 | 4709MB | 3604 | 2053 |
+| IQ3_S | 1.35 GiB | 3.76 | 4771MB | 3599 | 2049 |
+| IQ3_M | 1.38 GiB | 3.84 | 4803MB | 3603 | 2053 |
+| Q3_K_M | 1.48 GiB | 4.11 | 4899MB | 3450 | 2008 |
+| IQ4_NL | 1.69 GiB | 4.72 | 5123MB | 3571 | 2039 |
+| IQ4_XS | 1.61 GiB | 4.49 | 5041MB | 3585 | 2046 |
+| Q4_K_M | 1.79 GiB | 4.99 | 5223MB | 3558 | 2045 |
+| Q5_K_S | 2.02 GiB | 5.61 | 5451MB | 3567 | 2044 |
+| Q5_K_M | 2.07 GiB | 5.75 | 5505MB | 3528 | 2034 |
+| Q6_K | 2.36 GiB | 6.56 | 5801MB | 3334 | 1981 |
+| Q8_0 | 3.05 GiB | 8.50 | 6513MB | 3767 | 2101 |
+| F16 | 5.75 GiB | 16.00 | 9929MB | 3399 | 2023 |
+| v3 (Transformers) | 1.10 GiB | 16.00 | 2887MB | | 16505 |
+| v4 (Transformers) | 7.40 GiB | 16.00 | 14795MB | | 1865 |
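
For reference, a minimal sketch of the `imatrix`-based quantization flow referenced above; the file names and calibration corpus are illustrative, and the actual commands live in [`quantize.sh`](./quantize.sh):

```bash
# 1) Build an importance matrix from a calibration corpus (illustrative paths).
./llama-imatrix -m model-F16.gguf -f calibration.txt -o imatrix.dat

# 2) Quantize the F16 checkpoint guided by the importance matrix.
#    IQ3_S shown as an example, since it balances size and speed in Table 1.
./llama-quantize --imatrix imatrix.dat model-F16.gguf model-IQ3_S.gguf IQ3_S
```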
System info:
@@ -161,7 +158,7 @@ llama_context: n_ctx_per_seq = 4096
llama_context: n_batch = 4096
llama_context: n_ubatch = 4096
llama_context: causal_attn = 1
-llama_context: flash_attn = 1
+llama_context: flash_attn = 1 // 1 for w/ FA in the table; 0 for w/o FA
llama_context: kv_unified = true
llama_context: freq_base = 1000000.0
llama_context: freq_scale = 1
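
The two speed columns in Table 1 differ only in this flag. One way to compare FA on/off throughput for a given quant is llama.cpp's `llama-bench`; a sketch with an illustrative model path (the table's numbers come from an embedding run, so absolute values may differ):

```bash
# Benchmark with flash attention off and on (0 = w/o FA, 1 = w/ FA),
# using the same batch sizes as the llama_context log above.
./llama-bench -m model-IQ3_S.gguf -fa 0,1 -b 4096 -ub 4096
```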
@@ -210,7 +207,7 @@ main: number of embeddings = 5090
| Q6_K | 0.7951 | 0.5636 | 0.4822 | 0.4337 | 0.7846 | +8% | +0% | +10% | +1% | +5% | -0% | +7% | -0% | +0% | -1% |
| Q8_0 | 0.7938 | 0.5687 | 0.4784 | 0.4335 | 0.7851 | +7% | +0% | +11% | +2% | +4% | -1% | +7% | -0% | +0% | -1% |
| F16 | 0.7940 | 0.5610 | 0.4931 | 0.4343 | 0.7963 | +7% | +0% | +9% | +1% | +7% | +2% | +7% | -0% | +2% | +0% |
+| v3 (Transformers) | 0.7393 | 0.5144 | 0.4600 | 0.4068 | 0.7820 | +0% | -7% | +0% | -8% | +0% | -5% | +0% | -6% | +0% | -2% |
+| v4 (Transformers) | 0.7977 | 0.5571 | 0.4844 | 0.4351 | 0.7963 | +8% | +0% | +8% | +0% | +5% | +0% | +7% | +0% | +2% | +0% |
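
A note on reading the paired percentage columns (an inference from the numbers, not stated in the diff): each pair appears to be the relative difference of a row's score against the v3 and v4 (Transformers) baselines, i.e. (score / baseline − 1) × 100, rounded to the nearest integer. For example, Q6_K on the first benchmark: 0.7951 / 0.7393 − 1 ≈ +7.5%, matching the +8% shown against v3; and 0.7951 / 0.7977 − 1 ≈ −0.3%, shown as +0% against v4.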