# ----------------------------------------------------------------------
# Granite-3.3-2B-Slerp – 40-layer variant (v1.3-40L)
# ----------------------------------------------------------------------
# Goal: produce a stronger merged model when the underlying architecture
# has 40 transformer layers.
# ----------------------------------------------------------------------
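# To build the merge, a typical mergekit invocation looks like the following;
# the config filename and output directory are placeholders, not fixed names:
#
#   mergekit-yaml granite-3.3-2b-slerp-40L.yaml ./Granite-3.3-2B-Slerp --cuda
# ----------------------------------------------------------------------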
slices:
  - sources:
      - model: powermove72/granite-3.3-2b-Hermes3dataset
        layer_range: [0, 40]   # now 40 layers (indices 0-39)
      - model: ibm-granite/granite-3.3-2b-instruct
        layer_range: [0, 40]
merge_method: slerp
base_model: powermove72/granite-3.3-2b-Hermes3dataset
parameters:
  t:
    - filter: self_attn
      value:
        # Cosine-annealed schedule (49 values, interpolated across the 40 layers)
        - 1.000
        - 0.991
        - 0.967
        - 0.928
        - 0.876
        - 0.812
        - 0.739
        - 0.658
        - 0.572
        - 0.483
        - 0.393
        - 0.304
        - 0.218
        - 0.138
        - 0.067
        - 0.008
        - 0.000
        - 0.008
        - 0.067
        - 0.138
        - 0.218
        - 0.304
        - 0.393
        - 0.483
        - 0.572
        - 0.658
        - 0.739
        - 0.812
        - 0.876
        - 0.928
        - 0.967
        - 0.991
        - 1.000
        - 0.991
        - 0.967
        - 0.928
        - 0.876
        - 0.812
        - 0.739
        - 0.658
        - 0.572
        - 0.483
        - 0.393
        - 0.304
        - 0.218
        - 0.138
        - 0.067
        - 0.008
        - 0.000
    - filter: mlp
      value:
        # Complementary schedule (1 - self_attn)
        - 0.000
        - 0.009
        - 0.033
        - 0.072
        - 0.124
        - 0.188
        - 0.261
        - 0.342
        - 0.428
        - 0.517
        - 0.607
        - 0.696
        - 0.782
        - 0.862
        - 0.933
        - 0.992
        - 1.000
        - 0.992
        - 0.933
        - 0.862
        - 0.782
        - 0.696
        - 0.607
        - 0.517
        - 0.428
        - 0.342
        - 0.261
        - 0.188
        - 0.124
        - 0.072
        - 0.033
        - 0.009
        - 0.000
        - 0.009
        - 0.033
        - 0.072
        - 0.124
        - 0.188
        - 0.261
        - 0.342
        - 0.428
        - 0.517
        - 0.607
        - 0.696
        - 0.782
        - 0.862
        - 0.933
        - 0.992
        - 1.000
    - value: 0.5   # Default t for tensors not matched by a filter (e.g. embeddings, norms)
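  # The 49-entry schedules above follow three half-cosine legs (down, up, down
  # for self_attn; the mirror image for mlp). A sketch of how such vectors could
  # be generated offline; this is a hypothetical helper, not read by the merge,
  # and its output may differ slightly from the hand-tuned values above:
  #
  #   import math
  #   def cosine_leg(n, start=1.0, end=0.0):
  #       # n values easing from start to end along half a cosine period
  #       return [end + (start - end) * 0.5 * (1 + math.cos(math.pi * i / (n - 1)))
  #               for i in range(n)]
  #   attn_t = cosine_leg(17) + cosine_leg(17, 0.0, 1.0)[1:] + cosine_leg(17, 1.0, 0.0)[1:]
  #   mlp_t  = [round(1.0 - t, 3) for t in attn_t]   # complementary schedule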
dtype: bfloat16
seed: 42
deterministic: true
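# For reference, SLERP interpolates each matched pair of tensors along the arc
# between them:
#   slerp(w0, w1; t) = sin((1-t)*omega)/sin(omega) * w0 + sin(t*omega)/sin(omega) * w1,
#   where omega = arccos(dot(w0, w1) / (|w0| * |w1|)).
# Here t = 0 keeps the base_model tensor and t = 1 takes the corresponding
# tensor from the other model.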
metadata:
  model_name: Granite-3.3-2B-Slerp
  version: v1.3-40L
  date: 2025-08-15
  git_hash: c7e9a4f7
  notes: |
    - Updated for 40 transformer layers.
    - Cosine-annealed per-layer t vectors (self_attn & mlp) give a smooth
      transition between the two models across the depth of the network.
    - SLERP itself is deterministic; seed=42 is recorded so the surrounding
      pipeline stays reproducible.
    - Evaluation hook runs MMLU & HELM after the merge (see post_merge below).
    - Optional: add `t_amplitude: 0.6` to increase the contrast between the two models.
    - Optional: add a quantize step to `post_merge` for inference-only int8 deployment.
post_merge:
  - name: eval_benchmarks
    command: |
      python -m eval.run --model Granite-3.3-2B-Slerp --tasks mmlu,helm --precision bfloat16 --output results/2025-08-15-40L.json
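# Note: `seed`, `metadata`, and `post_merge` are consumed by the merge driver,
# not by the SLERP math itself. If your driver does not interpret `post_merge`
# natively, a thin wrapper could apply the hooks after merging. Hypothetical
# sketch (the config filename is a placeholder):
#
#   import subprocess
#   import yaml
#
#   with open("granite-3.3-2b-slerp-40L.yaml") as f:
#       cfg = yaml.safe_load(f)
#   for hook in cfg.get("post_merge", []):
#       print(f"Running post-merge hook: {hook['name']}")
#       subprocess.run(hook["command"], shell=True, check=True)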