# ----------------------------------------------------------------------
# merge_weighted_average_40layers.yaml
# Slice-wise weighted-average merge for a 40-layer LLM.
# - Each layer range gets a different contribution from each source model.
# ----------------------------------------------------------------------
# Merge algorithm: `linear` is used here as a weighted average of the source
# models, driven by the per-source `weight` values given under `slices`.
merge_method: linear

# ----------------------------------------------------------------------
# Global merge options
# ----------------------------------------------------------------------
# Data type requested for the merged tensors.
dtype: bfloat16

parameters:
  # Rescale the per-source weights of each slice so they sum to 1.0.
  normalize: true
  # NOTE(review): the three keys below are not documented parameters of the
  # `linear` merge method and are presumably ignored by it -- confirm.
  # Low-memory loading and the random seed are normally selected through
  # merge-tool CLI flags (e.g. `--lazy-unpickle`, `--random-seed`); verify
  # before relying on these keys for memory use or reproducibility.
  low_cpu_mem_usage: true  # intent: stream weights instead of loading all
  seed: 2025  # intent: reproducibility
  deterministic: true  # intent: deterministic execution
# ----------------------------------------------------------------------
# Metadata (provenance and experiment tracking)
# NOTE(review): `metadata` is not part of the documented merge-config
# schema; presumably informational only -- confirm the loader tolerates it.
# ----------------------------------------------------------------------
metadata:
  model_name: Granite-3.3-2B-Avg-SliceWeighted
  version: v1.0
  # Quoted so the value stays a string instead of being parsed as a date.
  date: '2025-08-15'
  # Literal block scalar: line breaks inside the notes are preserved.
  notes: |
    - 40‑layer model (indices 0‑39).
    - Three slices:
      * Layers 0‑13 → 80 % granite-3.3-2b-instruct, 20 % granite-3.3-2b-Hermes3dataset
      * Layers 14‑26 → 50 % each (mid‑point)
      * Layers 27‑39 → 20 % granite-3.3-2b-instruct, 80 % granite-3.3-2b-Hermes3dataset
    - Normalised weights are enforced by `parameters.normalize`.
    - Uses granite-3.3-2b-Hermes3dataset tokenizer for token‑id alignment.
# ----------------------------------------------------------------------
# Tokenizer: take the tokenizer from the Hermes3dataset fine-tune.
# The config author states that both source models share this tokenizer.
# NOTE(review): verify the two vocabularies really match before relying
# on that assumption.
# ----------------------------------------------------------------------
tokenizer_source: powermove72/granite-3.3-2b-Hermes3dataset
# ----------------------------------------------------------------------
# Slice definitions (non-overlapping, contiguous, covering layers 0-39).
# `layer_range` is half-open, [start, end): the end index is excluded,
# so layers a..b inclusive are written as [a, b + 1].
# Both sources in a slice must use the same range.
# ----------------------------------------------------------------------
slices:
  # --------------------------------------------------------------
  # Slice 1: layers 0-13 (the first 14 transformer blocks)
  # 80 % instruct base, 20 % Hermes3dataset fine-tune.
  # --------------------------------------------------------------
  - sources:
      - model: ibm-granite/granite-3.3-2b-instruct
        layer_range: [0, 14]
        parameters:
          weight: 0.8
      - model: powermove72/granite-3.3-2b-Hermes3dataset
        layer_range: [0, 14]
        parameters:
          weight: 0.2
  # --------------------------------------------------------------
  # Slice 2: layers 14-26 (the middle 13 transformer blocks)
  # Balanced 50 / 50 blend.
  # --------------------------------------------------------------
  - sources:
      - model: ibm-granite/granite-3.3-2b-instruct
        layer_range: [14, 27]
        parameters:
          weight: 0.5
      - model: powermove72/granite-3.3-2b-Hermes3dataset
        layer_range: [14, 27]
        parameters:
          weight: 0.5
  # --------------------------------------------------------------
  # Slice 3: layers 27-39 (the last 13 transformer blocks)
  # 20 % instruct base, 80 % Hermes3dataset fine-tune.
  # --------------------------------------------------------------
  - sources:
      - model: ibm-granite/granite-3.3-2b-instruct
        layer_range: [27, 40]
        parameters:
          weight: 0.2
      - model: powermove72/granite-3.3-2b-Hermes3dataset
        layer_range: [27, 40]
        parameters:
          weight: 0.8