diff --git a/llm-q-scaling-law-master/.gitignore b/llm-q-scaling-law-master/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..255e379c54060bdb5080ea36fac97de62d0ffed9 --- /dev/null +++ b/llm-q-scaling-law-master/.gitignore @@ -0,0 +1,7 @@ +.vscode/ +__pycache__/ +checkpoints/ +wandb/ +ckpts/ +ckpt/ +results/ \ No newline at end of file diff --git a/llm-q-scaling-law-master/.gitmodules b/llm-q-scaling-law-master/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..7535238d239310f6d9aa1aff284f727c8f9dca69 --- /dev/null +++ b/llm-q-scaling-law-master/.gitmodules @@ -0,0 +1,3 @@ +[submodule "src/lm-evaluation-harness"] + path = src/lm-evaluation-harness + url = git@github.com:EleutherAI/lm-evaluation-harness.git diff --git a/llm-q-scaling-law-master/README.md b/llm-q-scaling-law-master/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b06585d9018a7ed159708c7d689467e0565d4b12 --- /dev/null +++ b/llm-q-scaling-law-master/README.md @@ -0,0 +1,23 @@ +# LLM Quantisation Scaling Law + +## Setup + +Conda environment is recommended. To create a conda environment, run: + +```bash +conda create -n llm-mixed-q python=3.11 -y +pip install -r requirements.txt +git submodule update --init --recursive +``` + +## Features + +* Supported model architectures: + + Qwen2 + + OPT + + Llama + + +## Entry points + +The configuration file for model and search parameters is located in `./config/`. diff --git a/llm-q-scaling-law-master/configs/debug/integer.toml b/llm-q-scaling-law-master/configs/debug/integer.toml new file mode 100644 index 0000000000000000000000000000000000000000..934d1ad0bd8fd1fc9cadd820365547fcc6a7ab54 --- /dev/null +++ b/llm-q-scaling-law-master/configs/debug/integer.toml @@ -0,0 +1,19 @@ +[quantization] +name = "integer" +data_in_width = 16 +data_in_frac_width = 3 +weight_width = 16 +weight_frac_width = 3 +bias_width = 16 +bias_frac_width = 3 + +[setup] +# low to high precision ratio +ratio = 0.1 +# at what granularity? +# select from ["transformer_layer", "matmult"] +granularity = "transformer_layer" +tasks = ['sst', 'mnli'] +batch_size = 16 +num_samples_per_trial = 1024 +num_trials = 16 diff --git a/llm-q-scaling-law-master/configs/debug/mxint_4bit-bypass.toml b/llm-q-scaling-law-master/configs/debug/mxint_4bit-bypass.toml new file mode 100644 index 0000000000000000000000000000000000000000..9f812183143e727a9944e41c111660239a7cb3ff --- /dev/null +++ b/llm-q-scaling-law-master/configs/debug/mxint_4bit-bypass.toml @@ -0,0 +1,36 @@ +[quantization.linear.x] + name="bypass" + width=8 + block_size=16 + block_axis=-2 +[quantization.linear.w] + name="bypass" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.x] + name="bypass" + width=8 + block_size=16 + block_axis=-2 + +[quantization.matmul.w] + name="bypass" + width=8 + block_size=16 + block_axis=-1 + +[setup] + # low to high precision ratio + ratio=0.9 + # at what granularity? 
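The `width`/`frac_width` fields in `configs/debug/integer.toml` above describe a fixed-point grid. A minimal sketch of such a quantiser, assuming a symmetric signed fixed-point format (illustrative only, not this repository's implementation):

```python
# Illustrative sketch only: fixed-point quantisation driven by the
# width / frac_width fields of integer.toml. Not the repo's own code.
import torch

def integer_quantize(x: torch.Tensor, width: int, frac_width: int) -> torch.Tensor:
    """Round x onto a signed fixed-point grid: `width` total bits,
    `frac_width` of them fractional."""
    scale = 2.0 ** frac_width              # step size is 1 / scale
    qmin = -(2 ** (width - 1))
    qmax = 2 ** (width - 1) - 1
    return torch.clamp(torch.round(x * scale), qmin, qmax) / scale

# width=16, frac_width=3 (as in integer.toml) gives a step size of 1/8
x_q = integer_quantize(torch.randn(4, 4), width=16, frac_width=3)
```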
+ # select from ["transformer_layer", "matmult"] + granularity="transformer_layer" + # granularity="matmult" + tasks=['custom_alpaca'] + batch_size=8 + num_samples_per_trial=10 + num_trials=3 + device_map="auto-balanced" + random=true diff --git a/llm-q-scaling-law-master/configs/debug/mxint_4bit-no_evaluate.toml b/llm-q-scaling-law-master/configs/debug/mxint_4bit-no_evaluate.toml new file mode 100644 index 0000000000000000000000000000000000000000..8e34fecc258395ce3bffa7d8cb28e585ffc4f4c7 --- /dev/null +++ b/llm-q-scaling-law-master/configs/debug/mxint_4bit-no_evaluate.toml @@ -0,0 +1,42 @@ +[quantization.linear.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 +[quantization.linear.w] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.w] + name="mxint" + width=4 + block_size=16 + block_axis=-1 + +[setup] + # low to high precision ratio + ratio=0.9 + # at what granularity? + # select from ["transformer_layer", "matmult"] + granularity="transformer_layer" + # granularity="matmult" + tasks=['custom_alpaca'] + batch_size=4 + num_samples_per_trial=1000 + num_trials=3 + device_map="auto-balanced" + random=true + +[evaluation] + # set to [] if not using any evaluation benchmarks + tasks=[] + batch_size=1 + num_best_epochs=1 diff --git a/llm-q-scaling-law-master/configs/debug/mxint_4bit-tinyllama.toml b/llm-q-scaling-law-master/configs/debug/mxint_4bit-tinyllama.toml new file mode 100644 index 0000000000000000000000000000000000000000..6c0e5268ac3d633c5dbbdc7c5735cbaf181e4edd --- /dev/null +++ b/llm-q-scaling-law-master/configs/debug/mxint_4bit-tinyllama.toml @@ -0,0 +1,42 @@ +[quantization.linear.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 +[quantization.linear.w] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.w] + name="mxint" + width=4 + block_size=16 + block_axis=-1 + +[setup] + # low to high precision ratio + ratio=0.9 + # at what granularity? + # select from ["transformer_layer", "matmult"] + granularity="transformer_layer" + # granularity="matmult" + tasks=['custom_alpaca'] + batch_size=8 + num_samples_per_trial=100 + num_trials=3 + device_map="auto-balanced" + random=true + +[evaluation] + # set to [] if not using any evaluation benchmarks + tasks=['mmlu'] + batch_size=8 + num_best_epochs=1 diff --git a/llm-q-scaling-law-master/configs/search/layerwise/custom-eval.toml b/llm-q-scaling-law-master/configs/search/layerwise/custom-eval.toml new file mode 100644 index 0000000000000000000000000000000000000000..2c44cc112b5f0d135d4611614b848c20172a2a27 --- /dev/null +++ b/llm-q-scaling-law-master/configs/search/layerwise/custom-eval.toml @@ -0,0 +1,38 @@ +[quantization.linear.x] # this is E2M1 + name="minifloat" + width=4 + exponent_width=2 +[quantization.linear.w] + name="minifloat" + width=4 + exponent_width=2 + +[quantization.matmul.x] + name="minifloat" + width=4 + exponent_width=2 + +[quantization.matmul.w] + name="minifloat" + width=4 + exponent_width=2 + +[setup] + # low to high precision ratio + ratio=0.0 + # at what granularity? 
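The `mxint` entries in the configs above share one power-of-two scale per block of `block_size` values along `block_axis`, with a `width`-bit integer mantissa per element. A rough sketch of that idea, assuming the blocked dimension is divisible by `block_size` (illustrative only, not the repository's kernel):

```python
# Illustrative sketch only: shared-exponent block quantisation in the spirit
# of the mxint entries (width=4, block_size=16). Not the repo's own code.
import torch

def mxint_quantize(x: torch.Tensor, width: int = 4, block_size: int = 16,
                   block_axis: int = -2) -> torch.Tensor:
    x = x.transpose(block_axis, -1)                  # move the blocked dim last
    shape = x.shape
    assert shape[-1] % block_size == 0, "blocked dim must be divisible by block_size"
    blocks = x.reshape(-1, block_size)
    # one power-of-two exponent per block, taken from the block's max magnitude
    max_abs = blocks.abs().amax(dim=-1, keepdim=True).clamp(min=2.0 ** -126)
    exp = torch.floor(torch.log2(max_abs))
    step = 2.0 ** (exp - (width - 2))                # width-bit signed mantissa
    qmin, qmax = -(2 ** (width - 1)), 2 ** (width - 1) - 1
    q = torch.clamp(torch.round(blocks / step), qmin, qmax) * step
    return q.reshape(shape).transpose(block_axis, -1)
```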
+ # select from ["transformer_layer", "matmult"] + granularity="transformer_layer" + # granularity="matmult" + tasks=['custom_pajama'] + batch_size=8 + num_samples_per_trial=1000 + num_trials=4 + device_map="auto-balanced" + random=false + +[evaluation] + # set to [] if not using any evaluation benchmarks + tasks=['scaling_law_easy', 'scaling_law_hard'] + batch_size="auto:8" + num_best_epochs=3 diff --git a/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml b/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml new file mode 100644 index 0000000000000000000000000000000000000000..cf3b153eb11ed867fbf3e3ffa54c481b895232b1 --- /dev/null +++ b/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml @@ -0,0 +1,42 @@ +[quantization.linear.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 +[quantization.linear.w] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.w] + name="mxint" + width=4 + block_size=16 + block_axis=-1 + +[setup] + # low to high precision ratio + ratio=0.0 + # at what granularity? + # select from ["transformer_layer", "matmult"] + granularity="transformer_layer" + # granularity="matmult" + tasks=['custom_alpaca'] + batch_size=1 + num_samples_per_trial=1000 + num_trials=50 + device_map="auto-balanced" + random=false + +[evaluation] + # set to [] if not using any evaluation benchmarks + tasks=['mmlu'] + batch_size=1 + num_best_epochs=3 diff --git a/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml b/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml new file mode 100644 index 0000000000000000000000000000000000000000..e13e5aadec44bb697ba59e580711ace624ad2d26 --- /dev/null +++ b/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml @@ -0,0 +1,42 @@ +[quantization.linear.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 +[quantization.linear.w] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.w] + name="mxint" + width=4 + block_size=16 + block_axis=-1 + +[setup] + # low to high precision ratio + ratio=0.0 + # at what granularity? 
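`configs/search/layerwise/custom-eval.toml` above instead selects a `minifloat` format with `width=4`, `exponent_width=2` (E2M1, per its comment). A toy sketch that rounds onto such a grid by enumerating representable values; the grid below is one common E2M1 choice and is an assumption, not read from this repository:

```python
# Illustrative sketch only: round-to-nearest onto a tiny float grid such as
# E2M1 (width=4, exponent_width=2). Grid values are assumed, not the repo's.
import torch

# non-negative E2M1 magnitudes: 0, subnormal 0.5, then 1, 1.5, 2, 3, 4, 6
E2M1_LEVELS = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])

def minifloat_quantize(x: torch.Tensor, levels: torch.Tensor = E2M1_LEVELS) -> torch.Tensor:
    grid = torch.cat([-levels.flip(0), levels])      # symmetric signed grid
    idx = (x.unsqueeze(-1) - grid).abs().argmin(dim=-1)
    return grid[idx]

x_q = minifloat_quantize(torch.randn(4, 4))          # every entry lies on the grid
```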
+ # select from ["transformer_layer", "matmult"] + granularity="transformer_layer" + # granularity="matmult" + tasks=['custom_alpaca'] + batch_size=2 + num_samples_per_trial=1000 + num_trials=50 + device_map="auto-balanced" + random=false + +[evaluation] + # set to [] if not using any evaluation benchmarks + tasks=['mmlu'] + batch_size=2 + num_best_epochs=3 diff --git a/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-4.toml b/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-4.toml new file mode 100644 index 0000000000000000000000000000000000000000..9c388d977735628839c2e4640289b1621910b888 --- /dev/null +++ b/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-4.toml @@ -0,0 +1,42 @@ +[quantization.linear.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 +[quantization.linear.w] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.w] + name="mxint" + width=4 + block_size=16 + block_axis=-1 + +[setup] + # low to high precision ratio + ratio=0.0 + # at what granularity? + # select from ["transformer_layer", "matmult"] + granularity="transformer_layer" + # granularity="matmult" + tasks=['custom_alpaca'] + batch_size=4 + num_samples_per_trial=1000 + num_trials=50 + device_map="auto-balanced" + random=false + +[evaluation] + # set to [] if not using any evaluation benchmarks + tasks=['mmlu'] + batch_size=4 + num_best_epochs=3 diff --git a/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-8.toml b/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-8.toml new file mode 100644 index 0000000000000000000000000000000000000000..009b66d3d7dff0880c2a44f6c95155ad4349413c --- /dev/null +++ b/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-8.toml @@ -0,0 +1,42 @@ +[quantization.linear.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 +[quantization.linear.w] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.w] + name="mxint" + width=4 + block_size=16 + block_axis=-1 + +[setup] + # low to high precision ratio + ratio=0.0 + # at what granularity? 
+ # select from ["transformer_layer", "matmult"] + granularity="transformer_layer" + # granularity="matmult" + tasks=['custom_alpaca'] + batch_size=8 + num_samples_per_trial=1000 + num_trials=50 + device_map="auto-balanced" + random=false + +[evaluation] + # set to [] if not using any evaluation benchmarks + tasks=['mmlu'] + batch_size=8 + num_best_epochs=3 diff --git a/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml b/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml new file mode 100644 index 0000000000000000000000000000000000000000..a46c918a55e480c0df42a8041153e7a4766cf252 --- /dev/null +++ b/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml @@ -0,0 +1,42 @@ +[quantization.linear.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 +[quantization.linear.w] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.w] + name="mxint" + width=4 + block_size=16 + block_axis=-1 + +[setup] + # low to high precision ratio + ratio=0.0 + # at what granularity? + # select from ["transformer_layer", "matmult"] + granularity="transformer_layer" + # granularity="matmult" + tasks=['custom_pajama'] + batch_size=1 + num_samples_per_trial=1000 + num_trials=50 + device_map="auto-balanced" + random=false + +[evaluation] + # set to [] if not using any evaluation benchmarks + tasks=['mmlu'] + batch_size=1 + num_best_epochs=3 diff --git a/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml b/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml new file mode 100644 index 0000000000000000000000000000000000000000..80cf8ee69849b2aae2aaf6e18d7f468c8c6d6bd3 --- /dev/null +++ b/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml @@ -0,0 +1,42 @@ +[quantization.linear.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 +[quantization.linear.w] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.w] + name="mxint" + width=4 + block_size=16 + block_axis=-1 + +[setup] + # low to high precision ratio + ratio=0.0 + # at what granularity? 
+ # select from ["transformer_layer", "matmult"] + granularity="transformer_layer" + # granularity="matmult" + tasks=['custom_pajama'] + batch_size=2 + num_samples_per_trial=1000 + num_trials=50 + device_map="auto-balanced" + random=false + +[evaluation] + # set to [] if not using any evaluation benchmarks + tasks=['mmlu'] + batch_size=2 + num_best_epochs=3 diff --git a/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml b/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml new file mode 100644 index 0000000000000000000000000000000000000000..c5ac2ef5b28a40f8dff5c415480e7f39f2a073e4 --- /dev/null +++ b/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml @@ -0,0 +1,42 @@ +[quantization.linear.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 +[quantization.linear.w] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.w] + name="mxint" + width=4 + block_size=16 + block_axis=-1 + +[setup] + # low to high precision ratio + ratio=0.0 + # at what granularity? + # select from ["transformer_layer", "matmult"] + granularity="transformer_layer" + # granularity="matmult" + tasks=['custom_pajama'] + batch_size=4 + num_samples_per_trial=1000 + num_trials=50 + device_map="auto-balanced" + random=false + +[evaluation] + # set to [] if not using any evaluation benchmarks + tasks=['mmlu'] + batch_size=4 + num_best_epochs=3 diff --git a/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml b/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml new file mode 100644 index 0000000000000000000000000000000000000000..9d59c67cbe70f9070aaf61e39561e5c37257275e --- /dev/null +++ b/llm-q-scaling-law-master/configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml @@ -0,0 +1,42 @@ +[quantization.linear.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 +[quantization.linear.w] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.w] + name="mxint" + width=4 + block_size=16 + block_axis=-1 + +[setup] + # low to high precision ratio + ratio=0.0 + # at what granularity? 
+ # select from ["transformer_layer", "matmult"] + granularity="transformer_layer" + # granularity="matmult" + tasks=['custom_pajama'] + batch_size=8 + num_samples_per_trial=1000 + num_trials=50 + device_map="auto-balanced" + random=false + +[evaluation] + # set to [] if not using any evaluation benchmarks + tasks=['mmlu'] + batch_size=8 + num_best_epochs=3 diff --git a/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-alpaca-random-50-bs-1.toml b/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-alpaca-random-50-bs-1.toml new file mode 100644 index 0000000000000000000000000000000000000000..98d93db257e33d3b03e18719f7483ac786668ca3 --- /dev/null +++ b/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-alpaca-random-50-bs-1.toml @@ -0,0 +1,42 @@ +[quantization.linear.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 +[quantization.linear.w] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.w] + name="mxint" + width=4 + block_size=16 + block_axis=-1 + +[setup] + # low to high precision ratio + ratio=0.0 + # at what granularity? + # select from ["transformer_layer", "matmult"] + granularity="matmult" + # granularity="matmult" + tasks=['custom_alpaca'] + batch_size=1 + num_samples_per_trial=1000 + num_trials=50 + device_map="auto-balanced" + random=false + +[evaluation] + # set to [] if not using any evaluation benchmarks + tasks=['mmlu'] + batch_size=1 + num_best_epochs=3 diff --git a/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-alpaca-random-50-bs-2.toml b/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-alpaca-random-50-bs-2.toml new file mode 100644 index 0000000000000000000000000000000000000000..e6b4db1cfee2cca4b772f073d83cc270902b1776 --- /dev/null +++ b/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-alpaca-random-50-bs-2.toml @@ -0,0 +1,42 @@ +[quantization.linear.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 +[quantization.linear.w] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.w] + name="mxint" + width=4 + block_size=16 + block_axis=-1 + +[setup] + # low to high precision ratio + ratio=0.0 + # at what granularity? 
+ # select from ["transformer_layer", "matmult"] + granularity="matmult" + # granularity="matmult" + tasks=['custom_alpaca'] + batch_size=2 + num_samples_per_trial=1000 + num_trials=50 + device_map="auto-balanced" + random=false + +[evaluation] + # set to [] if not using any evaluation benchmarks + tasks=['mmlu'] + batch_size=1 + num_best_epochs=3 diff --git a/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-alpaca-random-50-bs-4.toml b/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-alpaca-random-50-bs-4.toml new file mode 100644 index 0000000000000000000000000000000000000000..95791d843e1489e316fe79e756c9c6d12e89292c --- /dev/null +++ b/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-alpaca-random-50-bs-4.toml @@ -0,0 +1,42 @@ +[quantization.linear.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 +[quantization.linear.w] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.w] + name="mxint" + width=4 + block_size=16 + block_axis=-1 + +[setup] + # low to high precision ratio + ratio=0.0 + # at what granularity? + # select from ["transformer_layer", "matmult"] + granularity="matmult" + # granularity="matmult" + tasks=['custom_alpaca'] + batch_size=4 + num_samples_per_trial=1000 + num_trials=50 + device_map="auto-balanced" + random=false + +[evaluation] + # set to [] if not using any evaluation benchmarks + tasks=['mmlu'] + batch_size=1 + num_best_epochs=3 diff --git a/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-alpaca-random-50-bs-8.toml b/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-alpaca-random-50-bs-8.toml new file mode 100644 index 0000000000000000000000000000000000000000..5d8a42e50bc0136ef036c77123a6a84eab577ea2 --- /dev/null +++ b/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-alpaca-random-50-bs-8.toml @@ -0,0 +1,42 @@ +[quantization.linear.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 +[quantization.linear.w] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.w] + name="mxint" + width=4 + block_size=16 + block_axis=-1 + +[setup] + # low to high precision ratio + ratio=0.0 + # at what granularity? + # select from ["transformer_layer", "matmult"] + granularity="matmult" + # granularity="matmult" + tasks=['custom_alpaca'] + batch_size=8 + num_samples_per_trial=1000 + num_trials=50 + device_map="auto-balanced" + random=false + +[evaluation] + # set to [] if not using any evaluation benchmarks + tasks=['mmlu'] + batch_size=1 + num_best_epochs=3 diff --git a/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml b/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml new file mode 100644 index 0000000000000000000000000000000000000000..295f524b473c8d4fdb7ef00921031d5989fadfd7 --- /dev/null +++ b/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml @@ -0,0 +1,42 @@ +[quantization.linear.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 +[quantization.linear.w] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.w] + name="mxint" + width=4 + block_size=16 + block_axis=-1 + +[setup] + # low to high precision ratio + ratio=0.0 + # at what granularity? 
+ # select from ["transformer_layer", "matmult"] + granularity="matmult" + # granularity="matmult" + tasks=['custom_pajama'] + batch_size=1 + num_samples_per_trial=1000 + num_trials=50 + device_map="auto-balanced" + random=false + +[evaluation] + # set to [] if not using any evaluation benchmarks + tasks=['mmlu'] + batch_size=1 + num_best_epochs=3 diff --git a/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml b/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml new file mode 100644 index 0000000000000000000000000000000000000000..55200b51c3a0959081a298e836c174a0a114d909 --- /dev/null +++ b/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml @@ -0,0 +1,42 @@ +[quantization.linear.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 +[quantization.linear.w] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.w] + name="mxint" + width=4 + block_size=16 + block_axis=-1 + +[setup] + # low to high precision ratio + ratio=0.0 + # at what granularity? + # select from ["transformer_layer", "matmult"] + granularity="matmult" + # granularity="matmult" + tasks=['custom_pajama'] + batch_size=2 + num_samples_per_trial=1000 + num_trials=50 + device_map="auto-balanced" + random=false + +[evaluation] + # set to [] if not using any evaluation benchmarks + tasks=['mmlu'] + batch_size=2 + num_best_epochs=3 diff --git a/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml b/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml new file mode 100644 index 0000000000000000000000000000000000000000..141eb88e5b895e3d9792d003d162aaab1de7c44f --- /dev/null +++ b/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml @@ -0,0 +1,42 @@ +[quantization.linear.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 +[quantization.linear.w] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.w] + name="mxint" + width=4 + block_size=16 + block_axis=-1 + +[setup] + # low to high precision ratio + ratio=0.0 + # at what granularity? + # select from ["transformer_layer", "matmult"] + granularity="matmult" + # granularity="matmult" + tasks=['custom_pajama'] + batch_size=4 + num_samples_per_trial=1000 + num_trials=50 + device_map="auto-balanced" + random=false + +[evaluation] + # set to [] if not using any evaluation benchmarks + tasks=['mmlu'] + batch_size=4 + num_best_epochs=3 diff --git a/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml b/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml new file mode 100644 index 0000000000000000000000000000000000000000..f373df8f50f76842c65112fb4dd9bb6f733c6cad --- /dev/null +++ b/llm-q-scaling-law-master/configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml @@ -0,0 +1,42 @@ +[quantization.linear.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 +[quantization.linear.w] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.x] + name="mxint" + width=4 + block_size=16 + block_axis=-2 + +[quantization.matmul.w] + name="mxint" + width=4 + block_size=16 + block_axis=-1 + +[setup] + # low to high precision ratio + ratio=0.0 + # at what granularity? 
+ # select from ["transformer_layer", "matmult"] + granularity="matmult" + # granularity="matmult" + tasks=['custom_pajama'] + batch_size=8 + num_samples_per_trial=1000 + num_trials=50 + device_map="auto-balanced" + random=false + +[evaluation] + # set to [] if not using any evaluation benchmarks + tasks=['mmlu'] + batch_size=8 + num_best_epochs=3 diff --git a/llm-q-scaling-law-master/eval_scripts/eval_harness_commands.txt b/llm-q-scaling-law-master/eval_scripts/eval_harness_commands.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac4ac14d435e00e5bddd8816d337b4cd209dc595 --- /dev/null +++ b/llm-q-scaling-law-master/eval_scripts/eval_harness_commands.txt @@ -0,0 +1,34 @@ +accelerate launch -m lm_eval --model hf \ + --tasks mmlu \ + --model_args pretrained=Qwen/Qwen1.5-0.5B \ + --batch_size 8 + +accelerate launch -m lm_eval --model hf \ + --tasks mmlu \ + --model_args pretrained=Qwen/Qwen1.5-1.8B \ + --batch_size 8 + +accelerate launch -m lm_eval --model hf \ + --tasks mmlu \ + --model_args pretrained=Qwen/Qwen1.5-4B \ + --batch_size 8 + +accelerate launch -m lm_eval --model hf \ + --tasks mmlu \ + --model_args pretrained=Qwen/Qwen1.5-7B \ + --batch_size 4 + +lm_eval --model hf \ + --tasks mmlu \ + --model_args pretrained=Qwen/Qwen1.5-14B,parallelize=True \ + --batch_size 8 + +lm_eval --model hf \ + --tasks mmlu \ + --model_args pretrained=Qwen/Qwen1.5-32B,parallelize=True \ + --batch_size 4 + +lm_eval --model hf \ + --tasks mmlu \ + --model_args pretrained=Qwen/Qwen1.5-72B,parallelize=True \ + --batch_size 1 \ No newline at end of file diff --git a/llm-q-scaling-law-master/eval_scripts/layerwise/evalscript_generator.py b/llm-q-scaling-law-master/eval_scripts/layerwise/evalscript_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..a25860fa5de7c17e819be111d6410ac3c288ecfb --- /dev/null +++ b/llm-q-scaling-law-master/eval_scripts/layerwise/evalscript_generator.py @@ -0,0 +1,141 @@ +""" +This generates the runscripts for the LLM-Q scaling law simulations. 
+ +""" + +import os +import argparse + +# the search command is +#python src/main.py --model_name {model_name} --search_config configs/search/mxint_4bit-tinyllama.toml --model_parallel --disable_wandb --save_dir ../ckpt/test +# the save dir is +# results/search/layerwise/{model_name}_{ratio} + +granularity = ["transformer_layer"] + +# opt template for searching + +opt_model_sizes=["opt-125m","opt-350m","opt-1.3b","opt-2.7b","opt-6.7b","opt-13b","opt-30b","opt-66b"] +opt_batch_size = [8,8,8,8,4,2,1,1] +opt_layer_wise_ratios = [0.0,0.5,0.6,0.7,0.8,0.9,0.95,0.975] +opt_layer_wise_search_config_template = "mxint_4bit-pajama-random-50-bs-{}.toml" +opt_layer_wise_search_dir_template = "eval/layerwise/{}_{}" + +opt_layerwise_run_command_palette = "python src/full_eval.py --model_arch opt --model_name {model_name} --ratio {q_ratio} --eval_config configs/search/layerwise/{search_config} --model_parallel --quantized_list \"{quantized_list}\"" + +with open("opt_layerwise_eval_commands.txt", "w") as f: + for i in range(len(opt_model_sizes)): + + eval_best_trail_file = "../../q_ratio_results/layerwise/opt/opt-{}-best-trail.txt".format(opt_model_sizes[i].split("-")[1]) + try: + with open(eval_best_trail_file, "r") as fi: + # read each line + lines = fi.readlines() + best_trail_q_lists_mapping = {} + for line in lines: + q_ratio, q_list = line.split(":") + q_list = q_list.strip() + #remove [] from the string + q_list = q_list[1:-1] + q_ratio = q_ratio.strip() + best_trail_q_lists_mapping[q_ratio] = q_list + except FileNotFoundError: + best_trail_q_lists_mapping = {} + + + model_size = opt_model_sizes[i] + model_batch_size = opt_batch_size[i] + for ratio in opt_layer_wise_ratios: + search_config = opt_layer_wise_search_config_template.format(model_batch_size) + save_dir = opt_layer_wise_search_dir_template.format(model_size, ratio) + q_list = best_trail_q_lists_mapping[str(ratio)] if str(ratio) in best_trail_q_lists_mapping else "" + model_name = f"facebook/{model_size}" + run_command = opt_layerwise_run_command_palette.format(model_name=model_name, q_ratio=ratio, search_config=search_config, save_dir=save_dir, quantized_list=q_list) + # print(run_command) + f.write(run_command + "\n") + f.write("\n") + + +# qwen template for searching + +qwen15_model_sizes=["Qwen1.5-0.5B","Qwen1.5-1.8B","Qwen1.5-4B","Qwen1.5-7B","Qwen1.5-14B","Qwen1.5-32B","Qwen1.5-72B","Qwen1.5-110B"] +qwen15_batch_size = [8,8,4,2,2,2,1,1] +qwen15_layer_wise_ratios = [0.0,0.5,0.6,0.7,0.8,0.9,0.95,0.975] +qwen15_layer_wise_search_config_template = "mxint_4bit-pajama-random-50-bs-{}.toml" +qwen15_layer_wise_search_dir_template = "eval/layerwise/{}_{}" + +qwen15_layerwise_run_command_palette = "python src/full_eval.py --model_arch qwen2 --model_name {model_name} --ratio {q_ratio} --eval_config configs/search/layerwise/{search_config} --model_parallel --quantized_list \"{quantized_list}\"" + +with open("qwen1.5_layerwise_eval_commands.txt", "w") as f: + for i in range(len(qwen15_model_sizes)): + + eval_best_trail_file = "../../q_ratio_results/layerwise/qwen1.5/qwen-{}-best-trail.txt".format(qwen15_model_sizes[i].split("-")[1]) + try: + with open(eval_best_trail_file, "r") as fi: + # read each line + lines = fi.readlines() + best_trail_q_lists_mapping = {} + for line in lines: + q_ratio, q_list = line.split(":") + q_list = q_list.strip() + q_list = q_list[1:-1] + q_ratio = q_ratio.strip() + best_trail_q_lists_mapping[q_ratio] = q_list + except FileNotFoundError: + best_trail_q_lists_mapping = {} + + # print(best_trail_q_lists_mapping) + + 
model_size = qwen15_model_sizes[i] + model_batch_size = qwen15_batch_size[i] + for ratio in qwen15_layer_wise_ratios: + search_config = qwen15_layer_wise_search_config_template.format(model_batch_size) + save_dir = qwen15_layer_wise_search_dir_template.format(model_size, ratio) + model_name = f"Qwen/{model_size}" + q_list = best_trail_q_lists_mapping[str(ratio)] if str(ratio) in best_trail_q_lists_mapping else "" + run_command = qwen15_layerwise_run_command_palette.format(model_name=model_name, q_ratio=ratio, search_config=search_config, save_dir=save_dir, quantized_list=q_list) + # print(run_command) + f.write(run_command + "\n") + f.write("\n") + +# llama template for searching + +# qwen template for searching + +llama_model_sizes=["llama-7b","llama-13b","llama-30b","llama-65b"] +llama_batch_size = [2,2,1,1] +llama_layer_wise_ratios = [0.0,0.5,0.6,0.7,0.8,0.9,0.95,0.975] +llama_layer_wise_search_config_template = "mxint_4bit-pajama-random-50-bs-{}.toml" +llama_layer_wise_search_dir_template = "eval/layerwise/{}_{}" + +llama_layerwise_run_command_palette = "python src/full_eval.py --model_arch llama --model_name {model_name} --ratio {q_ratio} --eval_config configs/search/layerwise/{search_config} --model_parallel --quantized_list \"{quantized_list}\"" + +with open("llama_layerwise_eval_commands.txt", "w") as f: + for i in range(len(llama_model_sizes)): + + eval_best_trail_file = "../../q_ratio_results/layerwise/llama/llama-{}-best-trail.txt".format(qwen15_model_sizes[i].split("-")[1]) + try: + with open(eval_best_trail_file, "r") as fi: + # read each line + lines = fi.readlines() + best_trail_q_lists_mapping = {} + for line in lines: + q_ratio, q_list = line.split(":") + q_list = q_list.strip() + q_list = q_list[1:-1] + q_ratio = q_ratio.strip() + best_trail_q_lists_mapping[q_ratio] = q_list + except FileNotFoundError: + best_trail_q_lists_mapping = {} + + model_size = llama_model_sizes[i] + model_batch_size = llama_batch_size[i] + for ratio in llama_layer_wise_ratios: + search_config = llama_layer_wise_search_config_template.format(model_batch_size) + save_dir = llama_layer_wise_search_dir_template.format(model_size, ratio) + model_name = f"huggyllama/{model_size}" + q_list = best_trail_q_lists_mapping[str(ratio)] if str(ratio) in best_trail_q_lists_mapping else "" + run_command = llama_layerwise_run_command_palette.format(model_name=model_name, q_ratio=ratio, search_config=search_config, save_dir=save_dir, quantized_list=q_list) + # print(run_command) + f.write(run_command + "\n") + f.write("\n") \ No newline at end of file diff --git a/llm-q-scaling-law-master/eval_scripts/layerwise/llama_layerwise_eval_commands.txt b/llm-q-scaling-law-master/eval_scripts/layerwise/llama_layerwise_eval_commands.txt new file mode 100644 index 0000000000000000000000000000000000000000..ffddcba47aaa60db2623714f9767946d60eebd73 --- /dev/null +++ b/llm-q-scaling-law-master/eval_scripts/layerwise/llama_layerwise_eval_commands.txt @@ -0,0 +1,36 @@ +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-7b --ratio 0.0 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-7b --ratio 0.5 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-7b --ratio 0.6 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml 
--model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-7b --ratio 0.7 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-7b --ratio 0.8 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-7b --ratio 0.9 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-7b --ratio 0.95 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-7b --ratio 0.975 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-13b --ratio 0.0 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-13b --ratio 0.5 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-13b --ratio 0.6 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-13b --ratio 0.7 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-13b --ratio 0.8 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-13b --ratio 0.9 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-13b --ratio 0.95 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-13b --ratio 0.975 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-30b --ratio 0.0 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-30b --ratio 0.5 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-30b --ratio 0.6 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-30b --ratio 0.7 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama 
--model_name huggyllama/llama-30b --ratio 0.8 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-30b --ratio 0.9 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-30b --ratio 0.95 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-30b --ratio 0.975 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-65b --ratio 0.0 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-65b --ratio 0.5 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-65b --ratio 0.6 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-65b --ratio 0.7 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-65b --ratio 0.8 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-65b --ratio 0.9 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-65b --ratio 0.95 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-65b --ratio 0.975 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" + diff --git a/llm-q-scaling-law-master/eval_scripts/layerwise/opt_layerwise_eval_commands.txt b/llm-q-scaling-law-master/eval_scripts/layerwise/opt_layerwise_eval_commands.txt new file mode 100644 index 0000000000000000000000000000000000000000..6def53c84a7aad32654455ca725c058d2773aa61 --- /dev/null +++ b/llm-q-scaling-law-master/eval_scripts/layerwise/opt_layerwise_eval_commands.txt @@ -0,0 +1,72 @@ +python src/full_eval.py --model_arch opt --model_name facebook/opt-125m --ratio 0.0 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-125m --ratio 0.5 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-125m --ratio 0.6 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-125m --ratio 0.7 --eval_config 
configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-125m --ratio 0.8 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-125m --ratio 0.9 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-125m --ratio 0.95 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-125m --ratio 0.975 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch opt --model_name facebook/opt-350m --ratio 0.0 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-350m --ratio 0.5 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-350m --ratio 0.6 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-350m --ratio 0.7 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-350m --ratio 0.8 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-350m --ratio 0.9 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-350m --ratio 0.95 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-350m --ratio 0.975 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch opt --model_name facebook/opt-1.3b --ratio 0.0 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-1.3b --ratio 0.5 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-1.3b --ratio 0.6 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-1.3b --ratio 0.7 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-1.3b --ratio 0.8 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name 
facebook/opt-1.3b --ratio 0.9 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-1.3b --ratio 0.95 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-1.3b --ratio 0.975 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch opt --model_name facebook/opt-2.7b --ratio 0.0 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-2.7b --ratio 0.5 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-2.7b --ratio 0.6 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-2.7b --ratio 0.7 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-2.7b --ratio 0.8 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-2.7b --ratio 0.9 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-2.7b --ratio 0.95 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-2.7b --ratio 0.975 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch opt --model_name facebook/opt-6.7b --ratio 0.0 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-6.7b --ratio 0.5 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-6.7b --ratio 0.6 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-6.7b --ratio 0.7 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-6.7b --ratio 0.8 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-6.7b --ratio 0.9 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-6.7b --ratio 0.95 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --quantized_list "" +python 
src/full_eval.py --model_arch opt --model_name facebook/opt-6.7b --ratio 0.975 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch opt --model_name facebook/opt-13b --ratio 0.0 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-13b --ratio 0.5 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-13b --ratio 0.6 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-13b --ratio 0.7 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-13b --ratio 0.8 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-13b --ratio 0.9 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-13b --ratio 0.95 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-13b --ratio 0.975 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch opt --model_name facebook/opt-30b --ratio 0.0 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-30b --ratio 0.5 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-30b --ratio 0.6 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-30b --ratio 0.7 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-30b --ratio 0.8 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-30b --ratio 0.9 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-30b --ratio 0.95 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-30b --ratio 0.975 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch opt --model_name facebook/opt-66b --ratio 0.0 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel 
--quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-66b --ratio 0.5 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-66b --ratio 0.6 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-66b --ratio 0.7 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-66b --ratio 0.8 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-66b --ratio 0.9 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-66b --ratio 0.95 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-66b --ratio 0.975 --eval_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" + diff --git a/llm-q-scaling-law-master/eval_scripts/layerwise/qwen1.5_layerwise_eval_commands.txt b/llm-q-scaling-law-master/eval_scripts/layerwise/qwen1.5_layerwise_eval_commands.txt new file mode 100644 index 0000000000000000000000000000000000000000..25a7a86a398e35c5f8b6aa1e021b9d80e7cf8fde --- /dev/null +++ b/llm-q-scaling-law-master/eval_scripts/layerwise/qwen1.5_layerwise_eval_commands.txt @@ -0,0 +1,72 @@ +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-0.5B --ratio 0.0 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-0.5B --ratio 0.5 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-0.5B --ratio 0.6 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-0.5B --ratio 0.7 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-0.5B --ratio 0.8 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-0.5B --ratio 0.9 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-0.5B --ratio 0.95 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-0.5B --ratio 0.975 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-8.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-1.8B --ratio 0.0 --eval_config 
configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-1.8B --ratio 0.5 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-1.8B --ratio 0.6 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-1.8B --ratio 0.7 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-1.8B --ratio 0.8 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-1.8B --ratio 0.9 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-1.8B --ratio 0.95 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-1.8B --ratio 0.975 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-8.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-4B --ratio 0.0 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-4B --ratio 0.5 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-4B --ratio 0.6 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-4B --ratio 0.7 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-4B --ratio 0.8 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-4B --ratio 0.9 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-4B --ratio 0.95 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-4B --ratio 0.975 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-4.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-7B --ratio 0.0 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-7B --ratio 0.5 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 
--model_name Qwen/Qwen1.5-7B --ratio 0.6 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-7B --ratio 0.7 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-7B --ratio 0.8 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-7B --ratio 0.9 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-7B --ratio 0.95 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-7B --ratio 0.975 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-14B --ratio 0.0 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-14B --ratio 0.5 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-14B --ratio 0.6 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-14B --ratio 0.7 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-14B --ratio 0.8 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-14B --ratio 0.9 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-14B --ratio 0.95 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-14B --ratio 0.975 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 
1, 0, 1, 1, 1, 1, 1, 0, 1, 1" + +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-32B --ratio 0.0 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-32B --ratio 0.5 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-32B --ratio 0.6 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-32B --ratio 0.7 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-32B --ratio 0.8 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-32B --ratio 0.9 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-32B --ratio 0.95 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-32B --ratio 0.975 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-2.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-72B --ratio 0.0 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-72B --ratio 0.5 --eval_config 
configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-72B --ratio 0.6 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-72B --ratio 0.7 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-72B --ratio 0.8 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-72B --ratio 0.9 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-72B --ratio 0.95 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-72B --ratio 0.975 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-110B --ratio 0.0 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-110B --ratio 0.5 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-110B --ratio 0.6 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-110B --ratio 0.7 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python 
src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-110B --ratio 0.8 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-110B --ratio 0.9 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-110B --ratio 0.95 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-110B --ratio 0.975 --eval_config configs/search/layerwise/mxint_4bit-alpaca-random-50-bs-1.toml --model_parallel --quantized_list "" + diff --git a/llm-q-scaling-law-master/eval_scripts/matmult/evalscript_generator.py b/llm-q-scaling-law-master/eval_scripts/matmult/evalscript_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..1e8c98a4286c4bb778c274863f46abe9f9785542 --- /dev/null +++ b/llm-q-scaling-law-master/eval_scripts/matmult/evalscript_generator.py @@ -0,0 +1,142 @@ +""" +This generates the runscripts for the LLM-Q scaling law simulations. + +""" + +import os +import argparse + +# the search command is +#python src/main.py --model_name {model_name} --search_config configs/search/mxint_4bit-tinyllama.toml --model_parallel --disable_wandb --save_dir ../ckpt/test +# the save dir is +# results/search/layerwise/{model_name}_{ratio} + +granularity = ["matmult"] + +# opt template for searching + +opt_model_sizes=["opt-125m","opt-350m","opt-1.3b","opt-2.7b","opt-6.7b","opt-13b","opt-30b","opt-66b"] +opt_batch_size = [8,8,8,8,4,2,1,1] +opt_matmult_ratios = [0.0,0.5,0.9,0.95,0.975,0.99] +opt_matmult_search_config_template = "mxint_4bit-pajama-random-50-bs-{}.toml" +opt_matmult_search_dir_template = "eval/matmult/{}_{}" + +opt_matmult_run_command_palette = "python src/full_eval.py --model_arch opt --model_name {model_name} --ratio {q_ratio} --eval_config configs/search/matmult/{search_config} --model_parallel --quantized_list \"{quantized_list}\"" + +with open("opt_matmult_eval_commands.txt", "w") as f: + for i in range(len(opt_model_sizes)): + + eval_best_trail_file = "../../q_ratio_results/matmult/opt/opt-{}-best-trail.txt".format(opt_model_sizes[i].split("-")[1]) + try: + with open(eval_best_trail_file, "r") as fi: + # read each line + lines = fi.readlines() + best_trail_q_lists_mapping = {} + for line in lines: + q_ratio, q_list = line.split(":") + q_list = q_list.strip() + #remove [] from the string + q_list = q_list[1:-1] + q_ratio = q_ratio.strip() + best_trail_q_lists_mapping[q_ratio] = q_list + except FileNotFoundError: + best_trail_q_lists_mapping = {} + + + model_size = opt_model_sizes[i] + model_batch_size = opt_batch_size[i] + for ratio in opt_matmult_ratios: + search_config = opt_matmult_search_config_template.format(model_batch_size) + save_dir = opt_matmult_search_dir_template.format(model_size, ratio) + q_list = best_trail_q_lists_mapping[str(ratio)] if str(ratio) in best_trail_q_lists_mapping else "" + model_name = f"facebook/{model_size}" + run_command = opt_matmult_run_command_palette.format(model_name=model_name, q_ratio=ratio, search_config=search_config, save_dir=save_dir, quantized_list=q_list) + # print(run_command) + f.write(run_command + "\n") + f.write("\n") + + +# qwen template for searching + 
+qwen15_model_sizes=["Qwen1.5-0.5B","Qwen1.5-1.8B","Qwen1.5-4B","Qwen1.5-7B","Qwen1.5-14B","Qwen1.5-32B","Qwen1.5-72B","Qwen1.5-110B"] +qwen15_batch_size = [8,8,4,2,2,2,1,1] +qwen15_matmult_ratios = [0.0,0.5,0.9,0.95,0.975,0.99] +qwen15_matmult_search_config_template = "mxint_4bit-pajama-random-50-bs-{}.toml" +qwen15_matmult_search_dir_template = "eval/matmult/{}_{}" + +qwen15_matmult_run_command_palette = "python src/full_eval.py --model_arch qwen2 --model_name {model_name} --ratio {q_ratio} --eval_config configs/search/matmult/{search_config} --model_parallel --quantized_list \"{quantized_list}\"" + +with open("qwen1.5_matmult_eval_commands.txt", "w") as f: +    for i in range(len(qwen15_model_sizes)): + +        eval_best_trail_file = "../../q_ratio_results/matmult/qwen1.5/qwen-{}-best-trail.txt".format(qwen15_model_sizes[i].split("-")[1]) +        try: +            with open(eval_best_trail_file, "r") as fi: +                # print("Reading best trail file: ", eval_best_trail_file) +                # read each line +                lines = fi.readlines() +                best_trail_q_lists_mapping = {} +                for line in lines: +                    q_ratio, q_list = line.split(":") +                    q_list = q_list.strip() +                    q_list = q_list[1:-1] +                    q_ratio = q_ratio.strip() +                    best_trail_q_lists_mapping[q_ratio] = q_list +        except FileNotFoundError: +            best_trail_q_lists_mapping = {} + +        # print(best_trail_q_lists_mapping) + +        model_size = qwen15_model_sizes[i] +        model_batch_size = qwen15_batch_size[i] +        for ratio in qwen15_matmult_ratios: +            search_config = qwen15_matmult_search_config_template.format(model_batch_size) +            save_dir = qwen15_matmult_search_dir_template.format(model_size, ratio) +            model_name = f"Qwen/{model_size}" +            q_list = best_trail_q_lists_mapping[str(ratio)] if str(ratio) in best_trail_q_lists_mapping else "" +            run_command = qwen15_matmult_run_command_palette.format(model_name=model_name, q_ratio=ratio, search_config=search_config, save_dir=save_dir, quantized_list=q_list) +            # print(run_command) +            f.write(run_command + "\n") +        f.write("\n") + +# llama template for searching + +llama_model_sizes=["llama-7b","llama-13b","llama-30b","llama-65b"] +llama_batch_size = [2,2,1,1] +llama_matmult_ratios = [0.0,0.5,0.9,0.95,0.975,0.99] +llama_matmult_search_config_template = "mxint_4bit-pajama-random-50-bs-{}.toml" +llama_matmult_search_dir_template = "eval/matmult/{}_{}" + +llama_matmult_run_command_palette = "python src/full_eval.py --model_arch llama --model_name {model_name} --ratio {q_ratio} --eval_config configs/search/matmult/{search_config} --model_parallel --quantized_list \"{quantized_list}\"" + +with open("llama_matmult_eval_commands.txt", "w") as f: +    for i in range(len(llama_model_sizes)): + +        # use the llama model list (not the qwen one) to locate the matching best-trail file +        eval_best_trail_file = "../../q_ratio_results/matmult/llama/llama-{}-best-trail.txt".format(llama_model_sizes[i].split("-")[1]) +        try: +            with open(eval_best_trail_file, "r") as fi: +                # read each line +                lines = fi.readlines() +                best_trail_q_lists_mapping = {} +                for line in lines: +                    q_ratio, q_list = line.split(":") +                    q_list = q_list.strip() +                    q_list = q_list[1:-1] +                    q_ratio = q_ratio.strip() +                    best_trail_q_lists_mapping[q_ratio] = q_list +        except FileNotFoundError: +            best_trail_q_lists_mapping = {} + +        model_size = llama_model_sizes[i] +        model_batch_size = llama_batch_size[i] +        for ratio in llama_matmult_ratios: +            search_config = llama_matmult_search_config_template.format(model_batch_size) +            save_dir = llama_matmult_search_dir_template.format(model_size, ratio) +            model_name = f"huggyllama/{model_size}" +            q_list = best_trail_q_lists_mapping[str(ratio)] if str(ratio) in 
best_trail_q_lists_mapping else "" + run_command = llama_matmult_run_command_palette.format(model_name=model_name, q_ratio=ratio, search_config=search_config, save_dir=save_dir, quantized_list=q_list) + # print(run_command) + f.write(run_command + "\n") + f.write("\n") \ No newline at end of file diff --git a/llm-q-scaling-law-master/eval_scripts/matmult/llama_matmult_eval_commands.txt b/llm-q-scaling-law-master/eval_scripts/matmult/llama_matmult_eval_commands.txt new file mode 100644 index 0000000000000000000000000000000000000000..3693a91891527789e9149a4f48a0c07682eaab75 --- /dev/null +++ b/llm-q-scaling-law-master/eval_scripts/matmult/llama_matmult_eval_commands.txt @@ -0,0 +1,28 @@ +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-7b --ratio 0.0 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-7b --ratio 0.5 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-7b --ratio 0.9 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-7b --ratio 0.95 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-7b --ratio 0.975 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-7b --ratio 0.99 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-13b --ratio 0.0 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-13b --ratio 0.5 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-13b --ratio 0.9 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-13b --ratio 0.95 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-13b --ratio 0.975 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-13b --ratio 0.99 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-30b --ratio 0.0 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-30b --ratio 0.5 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py 
--model_arch llama --model_name huggyllama/llama-30b --ratio 0.9 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-30b --ratio 0.95 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-30b --ratio 0.975 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-30b --ratio 0.99 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-65b --ratio 0.0 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-65b --ratio 0.5 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-65b --ratio 0.9 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-65b --ratio 0.95 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-65b --ratio 0.975 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch llama --model_name huggyllama/llama-65b --ratio 0.99 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" + diff --git a/llm-q-scaling-law-master/eval_scripts/matmult/opt_matmult_eval_commands.txt b/llm-q-scaling-law-master/eval_scripts/matmult/opt_matmult_eval_commands.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff5530b2ad57e47f980cf248af49f4a4fa9e5743 --- /dev/null +++ b/llm-q-scaling-law-master/eval_scripts/matmult/opt_matmult_eval_commands.txt @@ -0,0 +1,56 @@ +python src/full_eval.py --model_arch opt --model_name facebook/opt-125m --ratio 0.0 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-125m --ratio 0.5 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-125m --ratio 0.9 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-125m --ratio 0.95 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-125m --ratio 0.975 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-125m --ratio 0.99 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml 
--model_parallel --quantized_list "" + +python src/full_eval.py --model_arch opt --model_name facebook/opt-350m --ratio 0.0 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-350m --ratio 0.5 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-350m --ratio 0.9 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-350m --ratio 0.95 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-350m --ratio 0.975 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-350m --ratio 0.99 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch opt --model_name facebook/opt-1.3b --ratio 0.0 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-1.3b --ratio 0.5 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-1.3b --ratio 0.9 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-1.3b --ratio 0.95 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-1.3b --ratio 0.975 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-1.3b --ratio 0.99 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch opt --model_name facebook/opt-2.7b --ratio 0.0 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-2.7b --ratio 0.5 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-2.7b --ratio 0.9 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-2.7b --ratio 0.95 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-2.7b --ratio 0.975 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-2.7b --ratio 0.99 --eval_config 
configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch opt --model_name facebook/opt-6.7b --ratio 0.0 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-6.7b --ratio 0.5 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-6.7b --ratio 0.9 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-6.7b --ratio 0.95 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-6.7b --ratio 0.975 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-6.7b --ratio 0.99 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch opt --model_name facebook/opt-13b --ratio 0.0 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-13b --ratio 0.5 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-13b --ratio 0.9 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-13b --ratio 0.95 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-13b --ratio 0.975 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-13b --ratio 0.99 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch opt --model_name facebook/opt-30b --ratio 0.0 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-30b --ratio 0.5 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-30b --ratio 0.9 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-30b --ratio 0.95 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-30b --ratio 0.975 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-30b --ratio 0.99 --eval_config 
configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch opt --model_name facebook/opt-66b --ratio 0.0 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-66b --ratio 0.5 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-66b --ratio 0.9 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-66b --ratio 0.95 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-66b --ratio 0.975 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch opt --model_name facebook/opt-66b --ratio 0.99 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" + diff --git a/llm-q-scaling-law-master/eval_scripts/matmult/qwen1.5_matmult_eval_commands.txt b/llm-q-scaling-law-master/eval_scripts/matmult/qwen1.5_matmult_eval_commands.txt new file mode 100644 index 0000000000000000000000000000000000000000..4511ce87151f6cc78fe05d3119b2211c1ea91296 --- /dev/null +++ b/llm-q-scaling-law-master/eval_scripts/matmult/qwen1.5_matmult_eval_commands.txt @@ -0,0 +1,56 @@ +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-0.5B --ratio 0.0 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-0.5B --ratio 0.5 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-0.5B --ratio 0.9 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-0.5B --ratio 0.95 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-0.5B --ratio 0.975 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-0.5B --ratio 0.99 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-1.8B --ratio 0.0 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-1.8B --ratio 0.5 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-1.8B --ratio 0.9 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name 
Qwen/Qwen1.5-1.8B --ratio 0.95 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-1.8B --ratio 0.975 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-1.8B --ratio 0.99 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-4B --ratio 0.0 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-4B --ratio 0.5 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-4B --ratio 0.9 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-4B --ratio 0.95 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-4B --ratio 0.975 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-4B --ratio 0.99 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-7B --ratio 0.0 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-7B --ratio 0.5 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-7B --ratio 0.9 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-7B --ratio 0.95 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-7B --ratio 0.975 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-7B --ratio 0.99 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-14B --ratio 0.0 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-14B --ratio 0.5 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-14B --ratio 0.9 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch 
qwen2 --model_name Qwen/Qwen1.5-14B --ratio 0.95 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-14B --ratio 0.975 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-14B --ratio 0.99 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-32B --ratio 0.0 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-32B --ratio 0.5 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 
1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-32B --ratio 0.9 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-32B --ratio 0.95 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-32B --ratio 0.975 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-32B --ratio 0.99 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --quantized_list "1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1" + +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-72B --ratio 0.0 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-72B --ratio 0.5 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-72B --ratio 0.9 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-72B --ratio 0.95 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-72B --ratio 0.975 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-72B --ratio 0.99 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" + +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-110B --ratio 0.0 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-110B --ratio 0.5 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-110B --ratio 0.9 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-110B --ratio 0.95 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-110B --ratio 0.975 --eval_config 
configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" +python src/full_eval.py --model_arch qwen2 --model_name Qwen/Qwen1.5-110B --ratio 0.99 --eval_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --quantized_list "" + diff --git a/llm-q-scaling-law-master/q_ratio_results/layerwise/qwen1.5/qwen-0.5B-best-trail.txt b/llm-q-scaling-law-master/q_ratio_results/layerwise/qwen1.5/qwen-0.5B-best-trail.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/llm-q-scaling-law-master/q_ratio_results/layerwise/qwen1.5/qwen-1.8B-best-trail.txt b/llm-q-scaling-law-master/q_ratio_results/layerwise/qwen1.5/qwen-1.8B-best-trail.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/llm-q-scaling-law-master/q_ratio_results/layerwise/qwen1.5/qwen-14B-best-trail.txt b/llm-q-scaling-law-master/q_ratio_results/layerwise/qwen1.5/qwen-14B-best-trail.txt new file mode 100644 index 0000000000000000000000000000000000000000..32c3856d0e7efbb0f2638815dfe6e18d0dfdcf58 --- /dev/null +++ b/llm-q-scaling-law-master/q_ratio_results/layerwise/qwen1.5/qwen-14B-best-trail.txt @@ -0,0 +1,7 @@ +0.5 : [1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0] +0.6 : [1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0] +0.7 : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0] +0.8 : [1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1] +0.9 : [1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1] +0.95 : [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1] +0.975 : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1] \ No newline at end of file diff --git a/llm-q-scaling-law-master/q_ratio_results/layerwise/qwen1.5/qwen-32B-best-trail.txt b/llm-q-scaling-law-master/q_ratio_results/layerwise/qwen1.5/qwen-32B-best-trail.txt new file mode 100644 index 0000000000000000000000000000000000000000..7988062a15fc2a93ca70d780cd80c2aee11524e8 --- /dev/null +++ b/llm-q-scaling-law-master/q_ratio_results/layerwise/qwen1.5/qwen-32B-best-trail.txt @@ -0,0 +1,8 @@ +0.0 : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] +0.5 : [1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0] +0.6 : [0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0] +0.7 : [1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0] +0.8 : [1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 
0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0] +0.9 : [0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1] +0.95 : [0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0] +0.975 : \ No newline at end of file diff --git a/llm-q-scaling-law-master/q_ratio_results/layerwise/qwen1.5/qwen-4B-best-trail.txt b/llm-q-scaling-law-master/q_ratio_results/layerwise/qwen1.5/qwen-4B-best-trail.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/llm-q-scaling-law-master/q_ratio_results/layerwise/qwen1.5/qwen-72B-best-trail.txt b/llm-q-scaling-law-master/q_ratio_results/layerwise/qwen1.5/qwen-72B-best-trail.txt new file mode 100644 index 0000000000000000000000000000000000000000..507134a371ca54e24db085eb303f7aff16c96c72 --- /dev/null +++ b/llm-q-scaling-law-master/q_ratio_results/layerwise/qwen1.5/qwen-72B-best-trail.txt @@ -0,0 +1,8 @@ +0.0 : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] +0.5 : [0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0] +0.6 : [1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1] +0.7 : [0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0] +0.8 : [1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1] +0.9 : [0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1] +0.95 : [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] +0.975 : \ No newline at end of file diff --git a/llm-q-scaling-law-master/q_ratio_results/layerwise/qwen1.5/qwen-7B-best-trail.txt b/llm-q-scaling-law-master/q_ratio_results/layerwise/qwen1.5/qwen-7B-best-trail.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/llm-q-scaling-law-master/q_ratio_results/matmult/qwen1.5/qwen-32B-best-trail.txt b/llm-q-scaling-law-master/q_ratio_results/matmult/qwen1.5/qwen-32B-best-trail.txt new file mode 100644 index 
0000000000000000000000000000000000000000..cf84e30d27d25b400a34cc99ea86d15d17fd65a8 --- /dev/null +++ b/llm-q-scaling-law-master/q_ratio_results/matmult/qwen1.5/qwen-32B-best-trail.txt @@ -0,0 +1,6 @@ +0.0 : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] +0.5 : [1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 
1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0] +0.9 : [1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0] +0.95 : [1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 
1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1] +0.975 : [1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] +0.99 : [1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \ No newline at end of file diff --git 
a/llm-q-scaling-law-master/requirements.txt b/llm-q-scaling-law-master/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..7630eeaa3b03aff4c36318659c12e3a2df15bf3f --- /dev/null +++ b/llm-q-scaling-law-master/requirements.txt @@ -0,0 +1,26 @@ +torch==2.3.1 +transformers==4.42.3 +datasets==2.20.0 +nvitop +accelerate +joblib +optuna +wandb +toml + +evaluate>=0.4.0 +jsonlines +numexpr +peft>=0.2.0 +pybind11>=2.6.2 +pytablewriter +rouge-score>=0.0.4 +sacrebleu>=1.5.0 +scikit-learn>=0.24.1 +sqlitedict +tqdm-multiprocess +zstandard +dill +word2number +more_itertools +sentencepiece \ No newline at end of file diff --git a/llm-q-scaling-law-master/run_scripts/env_command.sh b/llm-q-scaling-law-master/run_scripts/env_command.sh new file mode 100644 index 0000000000000000000000000000000000000000..03337ff98f8ec2f8972da59a50d3ae59573e710f --- /dev/null +++ b/llm-q-scaling-law-master/run_scripts/env_command.sh @@ -0,0 +1,23 @@ +export CUDA_VISIBLE_DEVICES=0 +conda activate llm-mixed-q + +export CUDA_VISIBLE_DEVICES=1 +conda activate llm-mixed-q + +export CUDA_VISIBLE_DEVICES=2 +conda activate llm-mixed-q + +export CUDA_VISIBLE_DEVICES=3 +conda activate llm-mixed-q + +export CUDA_VISIBLE_DEVICES=4 +conda activate llm-mixed-q + +export CUDA_VISIBLE_DEVICES=5 +conda activate llm-mixed-q + +export CUDA_VISIBLE_DEVICES=6 +conda activate llm-mixed-q + +export CUDA_VISIBLE_DEVICES=7 +conda activate llm-mixed-q \ No newline at end of file diff --git a/llm-q-scaling-law-master/run_scripts/layerwise/llama2_layerwise_run_commands.txt b/llm-q-scaling-law-master/run_scripts/layerwise/llama2_layerwise_run_commands.txt new file mode 100644 index 0000000000000000000000000000000000000000..39586d1e41e0001905080f36c3cf9c36af0d39a4 --- /dev/null +++ b/llm-q-scaling-law-master/run_scripts/layerwise/llama2_layerwise_run_commands.txt @@ -0,0 +1,24 @@ +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-7b-chat-hf --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-7b_0.5 --wandb_group llama-layer-search --save_dir results/search/layerwise/Llama-2-7b_0.5 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-7b-chat-hf --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-7b_0.6 --wandb_group llama-layer-search --save_dir results/search/layerwise/Llama-2-7b_0.6 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-7b-chat-hf --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-7b_0.7 --wandb_group llama-layer-search --save_dir results/search/layerwise/Llama-2-7b_0.7 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-7b-chat-hf --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-7b_0.8 --wandb_group llama-layer-search --save_dir results/search/layerwise/Llama-2-7b_0.8 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-7b-chat-hf --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-7b_0.9 --wandb_group llama-layer-search --save_dir results/search/layerwise/Llama-2-7b_0.9 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-7b-chat-hf --q_ratio 0.95 --search_config 
configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-7b_0.95 --wandb_group llama-layer-search --save_dir results/search/layerwise/Llama-2-7b_0.95 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-7b-chat-hf --q_ratio 0.975 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-7b_0.975 --wandb_group llama-layer-search --save_dir results/search/layerwise/Llama-2-7b_0.975 + +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-13b-chat-hf --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-13b_0.5 --wandb_group llama-layer-search --save_dir results/search/layerwise/Llama-2-13b_0.5 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-13b-chat-hf --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-13b_0.6 --wandb_group llama-layer-search --save_dir results/search/layerwise/Llama-2-13b_0.6 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-13b-chat-hf --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-13b_0.7 --wandb_group llama-layer-search --save_dir results/search/layerwise/Llama-2-13b_0.7 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-13b-chat-hf --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-13b_0.8 --wandb_group llama-layer-search --save_dir results/search/layerwise/Llama-2-13b_0.8 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-13b-chat-hf --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-13b_0.9 --wandb_group llama-layer-search --save_dir results/search/layerwise/Llama-2-13b_0.9 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-13b-chat-hf --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-13b_0.95 --wandb_group llama-layer-search --save_dir results/search/layerwise/Llama-2-13b_0.95 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-13b-chat-hf --q_ratio 0.975 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-13b_0.975 --wandb_group llama-layer-search --save_dir results/search/layerwise/Llama-2-13b_0.975 + +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-70b-chat-hf --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Llama-2-70b_0.5 --wandb_group llama-layer-search --save_dir results/search/layerwise/Llama-2-70b_0.5 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-70b-chat-hf --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Llama-2-70b_0.6 --wandb_group llama-layer-search --save_dir results/search/layerwise/Llama-2-70b_0.6 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-70b-chat-hf --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Llama-2-70b_0.7 --wandb_group llama-layer-search 
--save_dir results/search/layerwise/Llama-2-70b_0.7 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-70b-chat-hf --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Llama-2-70b_0.8 --wandb_group llama-layer-search --save_dir results/search/layerwise/Llama-2-70b_0.8 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-70b-chat-hf --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Llama-2-70b_0.9 --wandb_group llama-layer-search --save_dir results/search/layerwise/Llama-2-70b_0.9 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-70b-chat-hf --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Llama-2-70b_0.95 --wandb_group llama-layer-search --save_dir results/search/layerwise/Llama-2-70b_0.95 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-70b-chat-hf --q_ratio 0.975 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Llama-2-70b_0.975 --wandb_group llama-layer-search --save_dir results/search/layerwise/Llama-2-70b_0.975 + diff --git a/llm-q-scaling-law-master/run_scripts/layerwise/llama_layerwise_run_commands.txt b/llm-q-scaling-law-master/run_scripts/layerwise/llama_layerwise_run_commands.txt new file mode 100644 index 0000000000000000000000000000000000000000..d5f05a01e3190ebb521bb7ec8fd4b86717fb2632 --- /dev/null +++ b/llm-q-scaling-law-master/run_scripts/layerwise/llama_layerwise_run_commands.txt @@ -0,0 +1,32 @@ +python src/main.py --model_arch llama --model_name huggyllama/llama-7b --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-7b_0.5 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-7b_0.5 +python src/main.py --model_arch llama --model_name huggyllama/llama-7b --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-7b_0.6 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-7b_0.6 +python src/main.py --model_arch llama --model_name huggyllama/llama-7b --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-7b_0.7 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-7b_0.7 +python src/main.py --model_arch llama --model_name huggyllama/llama-7b --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-7b_0.8 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-7b_0.8 +python src/main.py --model_arch llama --model_name huggyllama/llama-7b --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-7b_0.9 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-7b_0.9 +python src/main.py --model_arch llama --model_name huggyllama/llama-7b --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-7b_0.95 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-7b_0.95 +python src/main.py --model_arch llama --model_name huggyllama/llama-7b --q_ratio 0.975 --search_config 
configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-7b_0.975 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-7b_0.975 + +python src/main.py --model_arch llama --model_name huggyllama/llama-13b --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-13b_0.5 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-13b_0.5 +python src/main.py --model_arch llama --model_name huggyllama/llama-13b --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-13b_0.6 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-13b_0.6 +python src/main.py --model_arch llama --model_name huggyllama/llama-13b --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-13b_0.7 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-13b_0.7 +python src/main.py --model_arch llama --model_name huggyllama/llama-13b --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-13b_0.8 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-13b_0.8 +python src/main.py --model_arch llama --model_name huggyllama/llama-13b --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-13b_0.9 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-13b_0.9 +python src/main.py --model_arch llama --model_name huggyllama/llama-13b --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-13b_0.95 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-13b_0.95 +python src/main.py --model_arch llama --model_name huggyllama/llama-13b --q_ratio 0.975 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-13b_0.975 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-13b_0.975 + +python src/main.py --model_arch llama --model_name huggyllama/llama-30b --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-30b_0.5 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-30b_0.5 +python src/main.py --model_arch llama --model_name huggyllama/llama-30b --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-30b_0.6 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-30b_0.6 +python src/main.py --model_arch llama --model_name huggyllama/llama-30b --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-30b_0.7 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-30b_0.7 +python src/main.py --model_arch llama --model_name huggyllama/llama-30b --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-30b_0.8 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-30b_0.8 +python src/main.py --model_arch llama --model_name huggyllama/llama-30b --q_ratio 0.9 --search_config 
configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-30b_0.9 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-30b_0.9 +python src/main.py --model_arch llama --model_name huggyllama/llama-30b --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-30b_0.95 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-30b_0.95 +python src/main.py --model_arch llama --model_name huggyllama/llama-30b --q_ratio 0.975 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-30b_0.975 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-30b_0.975 + +python src/main.py --model_arch llama --model_name huggyllama/llama-65b --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-65b_0.5 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-65b_0.5 +python src/main.py --model_arch llama --model_name huggyllama/llama-65b --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-65b_0.6 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-65b_0.6 +python src/main.py --model_arch llama --model_name huggyllama/llama-65b --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-65b_0.7 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-65b_0.7 +python src/main.py --model_arch llama --model_name huggyllama/llama-65b --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-65b_0.8 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-65b_0.8 +python src/main.py --model_arch llama --model_name huggyllama/llama-65b --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-65b_0.9 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-65b_0.9 +python src/main.py --model_arch llama --model_name huggyllama/llama-65b --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-65b_0.95 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-65b_0.95 +python src/main.py --model_arch llama --model_name huggyllama/llama-65b --q_ratio 0.975 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-65b_0.975 --wandb_group llama-layer-search --save_dir results/search/layerwise/llama-65b_0.975 + diff --git a/llm-q-scaling-law-master/run_scripts/layerwise/opt_layerwise_run_commands.txt b/llm-q-scaling-law-master/run_scripts/layerwise/opt_layerwise_run_commands.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f487b3a6a448cba744c733fd5b9bb51d5d1dee8 --- /dev/null +++ b/llm-q-scaling-law-master/run_scripts/layerwise/opt_layerwise_run_commands.txt @@ -0,0 +1,64 @@ +python src/main.py --model_arch opt --model_name facebook/opt-125m --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-125m_0.5 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-125m_0.5 +python src/main.py 
--model_arch opt --model_name facebook/opt-125m --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-125m_0.6 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-125m_0.6 +python src/main.py --model_arch opt --model_name facebook/opt-125m --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-125m_0.7 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-125m_0.7 +python src/main.py --model_arch opt --model_name facebook/opt-125m --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-125m_0.8 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-125m_0.8 +python src/main.py --model_arch opt --model_name facebook/opt-125m --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-125m_0.9 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-125m_0.9 +python src/main.py --model_arch opt --model_name facebook/opt-125m --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-125m_0.95 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-125m_0.95 +python src/main.py --model_arch opt --model_name facebook/opt-125m --q_ratio 0.975 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-125m_0.975 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-125m_0.975 + +python src/main.py --model_arch opt --model_name facebook/opt-350m --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-350m_0.5 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-350m_0.5 +python src/main.py --model_arch opt --model_name facebook/opt-350m --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-350m_0.6 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-350m_0.6 +python src/main.py --model_arch opt --model_name facebook/opt-350m --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-350m_0.7 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-350m_0.7 +python src/main.py --model_arch opt --model_name facebook/opt-350m --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-350m_0.8 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-350m_0.8 +python src/main.py --model_arch opt --model_name facebook/opt-350m --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-350m_0.9 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-350m_0.9 +python src/main.py --model_arch opt --model_name facebook/opt-350m --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-350m_0.95 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-350m_0.95 +python src/main.py --model_arch opt --model_name facebook/opt-350m --q_ratio 0.975 --search_config 
configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-350m_0.975 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-350m_0.975 + +python src/main.py --model_arch opt --model_name facebook/opt-1.3b --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-1.3b_0.5 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-1.3b_0.5 +python src/main.py --model_arch opt --model_name facebook/opt-1.3b --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-1.3b_0.6 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-1.3b_0.6 +python src/main.py --model_arch opt --model_name facebook/opt-1.3b --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-1.3b_0.7 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-1.3b_0.7 +python src/main.py --model_arch opt --model_name facebook/opt-1.3b --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-1.3b_0.8 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-1.3b_0.8 +python src/main.py --model_arch opt --model_name facebook/opt-1.3b --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-1.3b_0.9 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-1.3b_0.9 +python src/main.py --model_arch opt --model_name facebook/opt-1.3b --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-1.3b_0.95 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-1.3b_0.95 +python src/main.py --model_arch opt --model_name facebook/opt-1.3b --q_ratio 0.975 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-1.3b_0.975 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-1.3b_0.975 + +python src/main.py --model_arch opt --model_name facebook/opt-2.7b --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-2.7b_0.5 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-2.7b_0.5 +python src/main.py --model_arch opt --model_name facebook/opt-2.7b --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-2.7b_0.6 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-2.7b_0.6 +python src/main.py --model_arch opt --model_name facebook/opt-2.7b --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-2.7b_0.7 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-2.7b_0.7 +python src/main.py --model_arch opt --model_name facebook/opt-2.7b --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-2.7b_0.8 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-2.7b_0.8 +python src/main.py --model_arch opt --model_name facebook/opt-2.7b --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-2.7b_0.9 --wandb_group 
opt-layer-search --save_dir results/search/layerwise/opt-2.7b_0.9 +python src/main.py --model_arch opt --model_name facebook/opt-2.7b --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-2.7b_0.95 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-2.7b_0.95 +python src/main.py --model_arch opt --model_name facebook/opt-2.7b --q_ratio 0.975 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-2.7b_0.975 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-2.7b_0.975 + +python src/main.py --model_arch opt --model_name facebook/opt-6.7b --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name opt-6.7b_0.5 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-6.7b_0.5 +python src/main.py --model_arch opt --model_name facebook/opt-6.7b --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name opt-6.7b_0.6 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-6.7b_0.6 +python src/main.py --model_arch opt --model_name facebook/opt-6.7b --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name opt-6.7b_0.7 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-6.7b_0.7 +python src/main.py --model_arch opt --model_name facebook/opt-6.7b --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name opt-6.7b_0.8 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-6.7b_0.8 +python src/main.py --model_arch opt --model_name facebook/opt-6.7b --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name opt-6.7b_0.9 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-6.7b_0.9 +python src/main.py --model_arch opt --model_name facebook/opt-6.7b --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name opt-6.7b_0.95 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-6.7b_0.95 +python src/main.py --model_arch opt --model_name facebook/opt-6.7b --q_ratio 0.975 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name opt-6.7b_0.975 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-6.7b_0.975 + +python src/main.py --model_arch opt --model_name facebook/opt-13b --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name opt-13b_0.5 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-13b_0.5 +python src/main.py --model_arch opt --model_name facebook/opt-13b --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name opt-13b_0.6 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-13b_0.6 +python src/main.py --model_arch opt --model_name facebook/opt-13b --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name opt-13b_0.7 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-13b_0.7 +python src/main.py --model_arch opt --model_name facebook/opt-13b 
--q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name opt-13b_0.8 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-13b_0.8 +python src/main.py --model_arch opt --model_name facebook/opt-13b --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name opt-13b_0.9 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-13b_0.9 +python src/main.py --model_arch opt --model_name facebook/opt-13b --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name opt-13b_0.95 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-13b_0.95 +python src/main.py --model_arch opt --model_name facebook/opt-13b --q_ratio 0.975 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name opt-13b_0.975 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-13b_0.975 + +python src/main.py --model_arch opt --model_name facebook/opt-30b --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-30b_0.5 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-30b_0.5 +python src/main.py --model_arch opt --model_name facebook/opt-30b --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-30b_0.6 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-30b_0.6 +python src/main.py --model_arch opt --model_name facebook/opt-30b --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-30b_0.7 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-30b_0.7 +python src/main.py --model_arch opt --model_name facebook/opt-30b --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-30b_0.8 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-30b_0.8 +python src/main.py --model_arch opt --model_name facebook/opt-30b --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-30b_0.9 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-30b_0.9 +python src/main.py --model_arch opt --model_name facebook/opt-30b --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-30b_0.95 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-30b_0.95 +python src/main.py --model_arch opt --model_name facebook/opt-30b --q_ratio 0.975 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-30b_0.975 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-30b_0.975 + +python src/main.py --model_arch opt --model_name facebook/opt-66b --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-66b_0.5 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-66b_0.5 +python src/main.py --model_arch opt --model_name facebook/opt-66b --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-66b_0.6 --wandb_group 
opt-layer-search --save_dir results/search/layerwise/opt-66b_0.6 +python src/main.py --model_arch opt --model_name facebook/opt-66b --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-66b_0.7 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-66b_0.7 +python src/main.py --model_arch opt --model_name facebook/opt-66b --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-66b_0.8 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-66b_0.8 +python src/main.py --model_arch opt --model_name facebook/opt-66b --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-66b_0.9 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-66b_0.9 +python src/main.py --model_arch opt --model_name facebook/opt-66b --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-66b_0.95 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-66b_0.95 +python src/main.py --model_arch opt --model_name facebook/opt-66b --q_ratio 0.975 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-66b_0.975 --wandb_group opt-layer-search --save_dir results/search/layerwise/opt-66b_0.975 + diff --git a/llm-q-scaling-law-master/run_scripts/layerwise/qwen1.5_layerwise_run_commands.txt b/llm-q-scaling-law-master/run_scripts/layerwise/qwen1.5_layerwise_run_commands.txt new file mode 100644 index 0000000000000000000000000000000000000000..faa0a99e92ac984c34fabb6c0200bd7414d9b308 --- /dev/null +++ b/llm-q-scaling-law-master/run_scripts/layerwise/qwen1.5_layerwise_run_commands.txt @@ -0,0 +1,64 @@ +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-0.5B --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-0.5B_0.5 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-0.5B_0.5 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-0.5B --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-0.5B_0.6 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-0.5B_0.6 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-0.5B --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-0.5B_0.7 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-0.5B_0.7 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-0.5B --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-0.5B_0.8 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-0.5B_0.8 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-0.5B --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-0.5B_0.9 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-0.5B_0.9 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-0.5B --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml 
--model_parallel --wandb_name Qwen1.5-0.5B_0.95 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-0.5B_0.95 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-0.5B --q_ratio 0.975 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-0.5B_0.975 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-0.5B_0.975 + +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-1.8B --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-1.8B_0.5 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-1.8B_0.5 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-1.8B --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-1.8B_0.6 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-1.8B_0.6 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-1.8B --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-1.8B_0.7 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-1.8B_0.7 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-1.8B --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-1.8B_0.8 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-1.8B_0.8 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-1.8B --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-1.8B_0.9 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-1.8B_0.9 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-1.8B --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-1.8B_0.95 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-1.8B_0.95 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-1.8B --q_ratio 0.975 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-1.8B_0.975 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-1.8B_0.975 + +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-4B --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name Qwen1.5-4B_0.5 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-4B_0.5 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-4B --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name Qwen1.5-4B_0.6 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-4B_0.6 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-4B --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name Qwen1.5-4B_0.7 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-4B_0.7 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-4B --q_ratio 0.8 --search_config 
configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name Qwen1.5-4B_0.8 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-4B_0.8 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-4B --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name Qwen1.5-4B_0.9 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-4B_0.9 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-4B --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name Qwen1.5-4B_0.95 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-4B_0.95 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-4B --q_ratio 0.975 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name Qwen1.5-4B_0.975 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-4B_0.975 + +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-7B --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-7B_0.5 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-7B_0.5 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-7B --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-7B_0.6 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-7B_0.6 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-7B --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-7B_0.7 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-7B_0.7 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-7B --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-7B_0.8 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-7B_0.8 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-7B --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-7B_0.9 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-7B_0.9 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-7B --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-7B_0.95 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-7B_0.95 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-7B --q_ratio 0.975 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-7B_0.975 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-7B_0.975 + +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-14B --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-14B_0.5 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-14B_0.5 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-14B --q_ratio 0.6 --search_config 
configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-14B_0.6 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-14B_0.6 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-14B --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-14B_0.7 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-14B_0.7 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-14B --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-14B_0.8 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-14B_0.8 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-14B --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-14B_0.9 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-14B_0.9 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-14B --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-14B_0.95 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-14B_0.95 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-14B --q_ratio 0.975 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-14B_0.975 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-14B_0.975 + +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-32B --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-32B_0.5 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-32B_0.5 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-32B --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-32B_0.6 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-32B_0.6 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-32B --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-32B_0.7 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-32B_0.7 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-32B --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-32B_0.8 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-32B_0.8 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-32B --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-32B_0.9 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-32B_0.9 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-32B --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-32B_0.95 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-32B_0.95 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-32B --q_ratio 0.975 
--search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-32B_0.975 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-32B_0.975 + +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-72B --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-72B_0.5 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-72B_0.5 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-72B --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-72B_0.6 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-72B_0.6 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-72B --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-72B_0.7 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-72B_0.7 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-72B --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-72B_0.8 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-72B_0.8 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-72B --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-72B_0.9 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-72B_0.9 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-72B --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-72B_0.95 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-72B_0.95 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-72B --q_ratio 0.975 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-72B_0.975 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-72B_0.975 + +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-110B --q_ratio 0.5 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-110B_0.5 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-110B_0.5 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-110B --q_ratio 0.6 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-110B_0.6 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-110B_0.6 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-110B --q_ratio 0.7 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-110B_0.7 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-110B_0.7 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-110B --q_ratio 0.8 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-110B_0.8 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-110B_0.8 +python src/main.py --model_arch qwen1.5 --model_name 
Qwen/Qwen1.5-110B --q_ratio 0.9 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-110B_0.9 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-110B_0.9 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-110B --q_ratio 0.95 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-110B_0.95 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-110B_0.95 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-110B --q_ratio 0.975 --search_config configs/search/layerwise/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-110B_0.975 --wandb_group qwen15-layer-search --save_dir results/search/layerwise/Qwen1.5-110B_0.975 + diff --git a/llm-q-scaling-law-master/run_scripts/layerwise/runscript_generator.py b/llm-q-scaling-law-master/run_scripts/layerwise/runscript_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..fd577216fa62679d52fb964c9dfe03fa7b344f5d --- /dev/null +++ b/llm-q-scaling-law-master/run_scripts/layerwise/runscript_generator.py @@ -0,0 +1,111 @@ +""" +This generates the runscripts for the LLM-Q scaling law simulations. + +""" + +import os +import argparse + +# the search command is +#python src/main.py --model_name {model_name} --search_config configs/search/mxint_4bit-tinyllama.toml --model_parallel --disable_wandb --save_dir ../ckpt/test +# the save dir is +# results/search/layerwise/{model_name}_{ratio} + +granularity = ["transformer_layer"] + +# opt template for searching + +opt_model_sizes=["opt-125m","opt-350m","opt-1.3b","opt-2.7b","opt-6.7b","opt-13b","opt-30b","opt-66b"] +opt_batch_size = [8,8,8,8,4,2,1,1] +opt_layer_wise_ratios = [0.5,0.6,0.7,0.8,0.9,0.95,0.975] +opt_layer_wise_search_config_template = "mxint_4bit-pajama-random-50-bs-{}.toml" +opt_layer_wise_search_dir_template = "search/layerwise/{}_{}" + +opt_layerwise_run_command_palette = "python src/main.py --model_arch opt --model_name {model_name} --q_ratio {q_ratio} --search_config configs/search/layerwise/{search_config} --model_parallel --wandb_name {wandb_name} --wandb_group opt-layer-search --save_dir results/{save_dir}" + +with open("opt_layerwise_run_commands.txt", "w") as f: + for i in range(len(opt_model_sizes)): + model_size = opt_model_sizes[i] + model_batch_size = opt_batch_size[i] + for ratio in opt_layer_wise_ratios: + search_config = opt_layer_wise_search_config_template.format(model_batch_size) + save_dir = opt_layer_wise_search_dir_template.format(model_size, ratio) + model_name = f"facebook/{model_size}" + wandb_name = f"{model_size}_{ratio}" + run_command = opt_layerwise_run_command_palette.format(model_name=model_name, q_ratio=ratio, search_config=search_config, wandb_name=wandb_name, save_dir=save_dir) + # print(run_command) + f.write(run_command + "\n") + f.write("\n") + + +# qwen template for searching + +qwen15_model_sizes=["Qwen1.5-0.5B","Qwen1.5-1.8B","Qwen1.5-4B","Qwen1.5-7B","Qwen1.5-14B","Qwen1.5-32B","Qwen1.5-72B","Qwen1.5-110B"] +qwen15_batch_size = [8,8,4,2,2,2,1,1] +qwen15_layer_wise_ratios = [0.5,0.6,0.7,0.8,0.9,0.95,0.975] +qwen15_layer_wise_search_config_template = "mxint_4bit-pajama-random-50-bs-{}.toml" +qwen15_layer_wise_search_dir_template = "search/layerwise/{}_{}" + +qwen15_layerwise_run_command_palette = "python src/main.py --model_arch qwen1.5 --model_name {model_name} --q_ratio {q_ratio} --search_config 
configs/search/layerwise/{search_config} --model_parallel --wandb_name {wandb_name} --wandb_group qwen15-layer-search --save_dir results/{save_dir}" + +with open("qwen1.5_layerwise_run_commands.txt", "w") as f: + for i in range(len(qwen15_model_sizes)): + model_size = qwen15_model_sizes[i] + model_batch_size = qwen15_batch_size[i] + for ratio in qwen15_layer_wise_ratios: + search_config = qwen15_layer_wise_search_config_template.format(model_batch_size) + save_dir = qwen15_layer_wise_search_dir_template.format(model_size, ratio) + model_name = f"Qwen/{model_size}" + wandb_name = f"{model_size}_{ratio}" + run_command = qwen15_layerwise_run_command_palette.format(model_name=model_name, q_ratio=ratio, search_config=search_config, wandb_name=wandb_name, save_dir=save_dir) + # print(run_command) + f.write(run_command + "\n") + f.write("\n") + +# llama template for searching + +# qwen template for searching + +llama_model_sizes=["llama-7b","llama-13b","llama-30b","llama-65b"] +llama_batch_size = [2,2,1,1] +llama_layer_wise_ratios = [0.5,0.6,0.7,0.8,0.9,0.95,0.975] +llama_layer_wise_search_config_template = "mxint_4bit-pajama-random-50-bs-{}.toml" +llama_layer_wise_search_dir_template = "search/layerwise/{}_{}" + +llama_layerwise_run_command_palette = "python src/main.py --model_arch llama --model_name {model_name} --q_ratio {q_ratio} --search_config configs/search/layerwise/{search_config} --model_parallel --wandb_name {wandb_name} --wandb_group llama-layer-search --save_dir results/{save_dir}" + +with open("llama_layerwise_run_commands.txt", "w") as f: + for i in range(len(llama_model_sizes)): + model_size = llama_model_sizes[i] + model_batch_size = llama_batch_size[i] + for ratio in llama_layer_wise_ratios: + search_config = llama_layer_wise_search_config_template.format(model_batch_size) + save_dir = llama_layer_wise_search_dir_template.format(model_size, ratio) + model_name = f"huggyllama/{model_size}" + wandb_name = f"{model_size}_{ratio}" + run_command = llama_layerwise_run_command_palette.format(model_name=model_name, q_ratio=ratio, search_config=search_config, wandb_name=wandb_name, save_dir=save_dir) + # print(run_command) + f.write(run_command + "\n") + f.write("\n") + +llama2_model_sizes=["Llama-2-7b","Llama-2-13b","Llama-2-70b"] +llama2_batch_size = [2,2,1,1] +llama2_layer_wise_ratios = [0.5,0.6,0.7,0.8,0.9,0.95,0.975] +llama2_layer_wise_search_config_template = "mxint_4bit-pajama-random-50-bs-{}.toml" +llama2_layer_wise_search_dir_template = "search/layerwise/{}_{}" + +llama2_layerwise_run_command_palette = "python src/main.py --model_arch llama2 --model_name {model_name} --q_ratio {q_ratio} --search_config configs/search/layerwise/{search_config} --model_parallel --wandb_name {wandb_name} --wandb_group llama-layer-search --save_dir results/{save_dir}" + +with open("llama2_layerwise_run_commands.txt", "w") as f: + for i in range(len(llama2_model_sizes)): + model_size = llama2_model_sizes[i] + model_batch_size = llama2_batch_size[i] + for ratio in llama2_layer_wise_ratios: + search_config = llama2_layer_wise_search_config_template.format(model_batch_size) + save_dir = llama2_layer_wise_search_dir_template.format(model_size, ratio) + model_name = f"meta-llama/{model_size}-chat-hf" # use the instruction tempalte + wandb_name = f"{model_size}_{ratio}" + run_command = llama2_layerwise_run_command_palette.format(model_name=model_name, q_ratio=ratio, search_config=search_config, wandb_name=wandb_name, save_dir=save_dir) + # print(run_command) + f.write(run_command + "\n") + f.write("\n") \ 
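+# Usage note: running `python runscript_generator.py` from this directory rewrites the four
+# *_layerwise_run_commands.txt files next to it (opt, qwen1.5, llama, llama2). Each non-empty
+# line of those files is a standalone `python src/main.py ...` invocation that uses
+# repository-root-relative paths (src/main.py, configs/...), so the commands are intended to be
+# replayed from the repository root, e.g. `bash run_scripts/layerwise/opt_layerwise_run_commands.txt`
+# or dispatched line-by-line to a job scheduler.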
No newline at end of file diff --git a/llm-q-scaling-law-master/run_scripts/matmult/llama2_matmult_run_commands.txt b/llm-q-scaling-law-master/run_scripts/matmult/llama2_matmult_run_commands.txt new file mode 100644 index 0000000000000000000000000000000000000000..bce622c65f4e6b528a5a16f5f48890bbda811fb1 --- /dev/null +++ b/llm-q-scaling-law-master/run_scripts/matmult/llama2_matmult_run_commands.txt @@ -0,0 +1,18 @@ +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-7b-chat-hf --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-7b_0.5 --wandb_group llama-layer-search --save_dir results/search/matmult/Llama-2-7b_0.5 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-7b-chat-hf --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-7b_0.9 --wandb_group llama-layer-search --save_dir results/search/matmult/Llama-2-7b_0.9 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-7b-chat-hf --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-7b_0.95 --wandb_group llama-layer-search --save_dir results/search/matmult/Llama-2-7b_0.95 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-7b-chat-hf --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-7b_0.975 --wandb_group llama-layer-search --save_dir results/search/matmult/Llama-2-7b_0.975 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-7b-chat-hf --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-7b_0.99 --wandb_group llama-layer-search --save_dir results/search/matmult/Llama-2-7b_0.99 + +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-13b-chat-hf --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-13b_0.5 --wandb_group llama-layer-search --save_dir results/search/matmult/Llama-2-13b_0.5 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-13b-chat-hf --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-13b_0.9 --wandb_group llama-layer-search --save_dir results/search/matmult/Llama-2-13b_0.9 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-13b-chat-hf --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-13b_0.95 --wandb_group llama-layer-search --save_dir results/search/matmult/Llama-2-13b_0.95 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-13b-chat-hf --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-13b_0.975 --wandb_group llama-layer-search --save_dir results/search/matmult/Llama-2-13b_0.975 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-13b-chat-hf --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Llama-2-13b_0.99 --wandb_group llama-layer-search --save_dir results/search/matmult/Llama-2-13b_0.99 + +python src/main.py --model_arch llama2 --model_name 
meta-llama/Llama-2-70b-chat-hf --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Llama-2-70b_0.5 --wandb_group llama-layer-search --save_dir results/search/matmult/Llama-2-70b_0.5 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-70b-chat-hf --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Llama-2-70b_0.9 --wandb_group llama-layer-search --save_dir results/search/matmult/Llama-2-70b_0.9 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-70b-chat-hf --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Llama-2-70b_0.95 --wandb_group llama-layer-search --save_dir results/search/matmult/Llama-2-70b_0.95 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-70b-chat-hf --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Llama-2-70b_0.975 --wandb_group llama-layer-search --save_dir results/search/matmult/Llama-2-70b_0.975 +python src/main.py --model_arch llama2 --model_name meta-llama/Llama-2-70b-chat-hf --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Llama-2-70b_0.99 --wandb_group llama-layer-search --save_dir results/search/matmult/Llama-2-70b_0.99 + diff --git a/llm-q-scaling-law-master/run_scripts/matmult/llama_matmult_run_commands.txt b/llm-q-scaling-law-master/run_scripts/matmult/llama_matmult_run_commands.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9edb20f83ea68bcdb23e588508db6d295815962 --- /dev/null +++ b/llm-q-scaling-law-master/run_scripts/matmult/llama_matmult_run_commands.txt @@ -0,0 +1,24 @@ +python src/main.py --model_arch llama --model_name huggyllama/llama-7b --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-7b_0.5 --wandb_group llama-matmult-search --save_dir results/search/matmult/llama-7b_0.5 +python src/main.py --model_arch llama --model_name huggyllama/llama-7b --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-7b_0.9 --wandb_group llama-matmult-search --save_dir results/search/matmult/llama-7b_0.9 +python src/main.py --model_arch llama --model_name huggyllama/llama-7b --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-7b_0.95 --wandb_group llama-matmult-search --save_dir results/search/matmult/llama-7b_0.95 +python src/main.py --model_arch llama --model_name huggyllama/llama-7b --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-7b_0.975 --wandb_group llama-matmult-search --save_dir results/search/matmult/llama-7b_0.975 +python src/main.py --model_arch llama --model_name huggyllama/llama-7b --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-7b_0.99 --wandb_group llama-matmult-search --save_dir results/search/matmult/llama-7b_0.99 + +python src/main.py --model_arch llama --model_name huggyllama/llama-13b --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-13b_0.5 --wandb_group 
llama-matmult-search --save_dir results/search/matmult/llama-13b_0.5 +python src/main.py --model_arch llama --model_name huggyllama/llama-13b --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-13b_0.9 --wandb_group llama-matmult-search --save_dir results/search/matmult/llama-13b_0.9 +python src/main.py --model_arch llama --model_name huggyllama/llama-13b --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-13b_0.95 --wandb_group llama-matmult-search --save_dir results/search/matmult/llama-13b_0.95 +python src/main.py --model_arch llama --model_name huggyllama/llama-13b --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-13b_0.975 --wandb_group llama-matmult-search --save_dir results/search/matmult/llama-13b_0.975 +python src/main.py --model_arch llama --model_name huggyllama/llama-13b --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name llama-13b_0.99 --wandb_group llama-matmult-search --save_dir results/search/matmult/llama-13b_0.99 + +python src/main.py --model_arch llama --model_name huggyllama/llama-30b --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-30b_0.5 --wandb_group llama-matmult-search --save_dir results/search/matmult/llama-30b_0.5 +python src/main.py --model_arch llama --model_name huggyllama/llama-30b --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-30b_0.9 --wandb_group llama-matmult-search --save_dir results/search/matmult/llama-30b_0.9 +python src/main.py --model_arch llama --model_name huggyllama/llama-30b --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-30b_0.95 --wandb_group llama-matmult-search --save_dir results/search/matmult/llama-30b_0.95 +python src/main.py --model_arch llama --model_name huggyllama/llama-30b --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-30b_0.975 --wandb_group llama-matmult-search --save_dir results/search/matmult/llama-30b_0.975 +python src/main.py --model_arch llama --model_name huggyllama/llama-30b --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-30b_0.99 --wandb_group llama-matmult-search --save_dir results/search/matmult/llama-30b_0.99 + +python src/main.py --model_arch llama --model_name huggyllama/llama-65b --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-65b_0.5 --wandb_group llama-matmult-search --save_dir results/search/matmult/llama-65b_0.5 +python src/main.py --model_arch llama --model_name huggyllama/llama-65b --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-65b_0.9 --wandb_group llama-matmult-search --save_dir results/search/matmult/llama-65b_0.9 +python src/main.py --model_arch llama --model_name huggyllama/llama-65b --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-65b_0.95 --wandb_group llama-matmult-search --save_dir 
results/search/matmult/llama-65b_0.95 +python src/main.py --model_arch llama --model_name huggyllama/llama-65b --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-65b_0.975 --wandb_group llama-matmult-search --save_dir results/search/matmult/llama-65b_0.975 +python src/main.py --model_arch llama --model_name huggyllama/llama-65b --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name llama-65b_0.99 --wandb_group llama-matmult-search --save_dir results/search/matmult/llama-65b_0.99 + diff --git a/llm-q-scaling-law-master/run_scripts/matmult/opt_matmult_run_commands.txt b/llm-q-scaling-law-master/run_scripts/matmult/opt_matmult_run_commands.txt new file mode 100644 index 0000000000000000000000000000000000000000..df6e0771c95ffa19bde3c97c7e1aee2ca4130ba8 --- /dev/null +++ b/llm-q-scaling-law-master/run_scripts/matmult/opt_matmult_run_commands.txt @@ -0,0 +1,48 @@ +python src/main.py --model_arch opt --model_name facebook/opt-125m --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-125m_0.5 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-125m_0.5 +python src/main.py --model_arch opt --model_name facebook/opt-125m --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-125m_0.9 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-125m_0.9 +python src/main.py --model_arch opt --model_name facebook/opt-125m --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-125m_0.95 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-125m_0.95 +python src/main.py --model_arch opt --model_name facebook/opt-125m --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-125m_0.975 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-125m_0.975 +python src/main.py --model_arch opt --model_name facebook/opt-125m --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-125m_0.99 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-125m_0.99 + +python src/main.py --model_arch opt --model_name facebook/opt-350m --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-350m_0.5 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-350m_0.5 +python src/main.py --model_arch opt --model_name facebook/opt-350m --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-350m_0.9 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-350m_0.9 +python src/main.py --model_arch opt --model_name facebook/opt-350m --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-350m_0.95 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-350m_0.95 +python src/main.py --model_arch opt --model_name facebook/opt-350m --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-350m_0.975 --wandb_group opt-matmult-search --save_dir 
results/search/matmult/opt-350m_0.975 +python src/main.py --model_arch opt --model_name facebook/opt-350m --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-350m_0.99 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-350m_0.99 + +python src/main.py --model_arch opt --model_name facebook/opt-1.3b --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-1.3b_0.5 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-1.3b_0.5 +python src/main.py --model_arch opt --model_name facebook/opt-1.3b --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-1.3b_0.9 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-1.3b_0.9 +python src/main.py --model_arch opt --model_name facebook/opt-1.3b --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-1.3b_0.95 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-1.3b_0.95 +python src/main.py --model_arch opt --model_name facebook/opt-1.3b --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-1.3b_0.975 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-1.3b_0.975 +python src/main.py --model_arch opt --model_name facebook/opt-1.3b --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-1.3b_0.99 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-1.3b_0.99 + +python src/main.py --model_arch opt --model_name facebook/opt-2.7b --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-2.7b_0.5 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-2.7b_0.5 +python src/main.py --model_arch opt --model_name facebook/opt-2.7b --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-2.7b_0.9 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-2.7b_0.9 +python src/main.py --model_arch opt --model_name facebook/opt-2.7b --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-2.7b_0.95 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-2.7b_0.95 +python src/main.py --model_arch opt --model_name facebook/opt-2.7b --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-2.7b_0.975 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-2.7b_0.975 +python src/main.py --model_arch opt --model_name facebook/opt-2.7b --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name opt-2.7b_0.99 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-2.7b_0.99 + +python src/main.py --model_arch opt --model_name facebook/opt-6.7b --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name opt-6.7b_0.5 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-6.7b_0.5 +python src/main.py --model_arch opt --model_name facebook/opt-6.7b --q_ratio 0.9 --search_config 
configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name opt-6.7b_0.9 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-6.7b_0.9 +python src/main.py --model_arch opt --model_name facebook/opt-6.7b --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name opt-6.7b_0.95 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-6.7b_0.95 +python src/main.py --model_arch opt --model_name facebook/opt-6.7b --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name opt-6.7b_0.975 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-6.7b_0.975 +python src/main.py --model_arch opt --model_name facebook/opt-6.7b --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name opt-6.7b_0.99 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-6.7b_0.99 + +python src/main.py --model_arch opt --model_name facebook/opt-13b --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name opt-13b_0.5 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-13b_0.5 +python src/main.py --model_arch opt --model_name facebook/opt-13b --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name opt-13b_0.9 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-13b_0.9 +python src/main.py --model_arch opt --model_name facebook/opt-13b --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name opt-13b_0.95 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-13b_0.95 +python src/main.py --model_arch opt --model_name facebook/opt-13b --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name opt-13b_0.975 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-13b_0.975 +python src/main.py --model_arch opt --model_name facebook/opt-13b --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name opt-13b_0.99 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-13b_0.99 + +python src/main.py --model_arch opt --model_name facebook/opt-30b --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-30b_0.5 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-30b_0.5 +python src/main.py --model_arch opt --model_name facebook/opt-30b --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-30b_0.9 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-30b_0.9 +python src/main.py --model_arch opt --model_name facebook/opt-30b --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-30b_0.95 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-30b_0.95 +python src/main.py --model_arch opt --model_name facebook/opt-30b --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-30b_0.975 --wandb_group opt-matmult-search --save_dir 
results/search/matmult/opt-30b_0.975 +python src/main.py --model_arch opt --model_name facebook/opt-30b --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-30b_0.99 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-30b_0.99 + +python src/main.py --model_arch opt --model_name facebook/opt-66b --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-66b_0.5 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-66b_0.5 +python src/main.py --model_arch opt --model_name facebook/opt-66b --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-66b_0.9 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-66b_0.9 +python src/main.py --model_arch opt --model_name facebook/opt-66b --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-66b_0.95 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-66b_0.95 +python src/main.py --model_arch opt --model_name facebook/opt-66b --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-66b_0.975 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-66b_0.975 +python src/main.py --model_arch opt --model_name facebook/opt-66b --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name opt-66b_0.99 --wandb_group opt-matmult-search --save_dir results/search/matmult/opt-66b_0.99 + diff --git a/llm-q-scaling-law-master/run_scripts/matmult/qwen1.5_matmult_run_commands.txt b/llm-q-scaling-law-master/run_scripts/matmult/qwen1.5_matmult_run_commands.txt new file mode 100644 index 0000000000000000000000000000000000000000..02368e1b11a39f8bac8c74d4fb7cd9471c752f8f --- /dev/null +++ b/llm-q-scaling-law-master/run_scripts/matmult/qwen1.5_matmult_run_commands.txt @@ -0,0 +1,48 @@ +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-0.5B --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-0.5B_0.5 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-0.5B_0.5 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-0.5B --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-0.5B_0.9 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-0.5B_0.9 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-0.5B --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-0.5B_0.95 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-0.5B_0.95 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-0.5B --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-0.5B_0.975 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-0.5B_0.975 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-0.5B --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-0.5B_0.99 --wandb_group 
qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-0.5B_0.99 + +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-1.8B --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-1.8B_0.5 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-1.8B_0.5 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-1.8B --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-1.8B_0.9 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-1.8B_0.9 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-1.8B --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-1.8B_0.95 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-1.8B_0.95 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-1.8B --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-1.8B_0.975 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-1.8B_0.975 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-1.8B --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-8.toml --model_parallel --wandb_name Qwen1.5-1.8B_0.99 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-1.8B_0.99 + +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-4B --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name Qwen1.5-4B_0.5 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-4B_0.5 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-4B --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name Qwen1.5-4B_0.9 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-4B_0.9 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-4B --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name Qwen1.5-4B_0.95 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-4B_0.95 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-4B --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name Qwen1.5-4B_0.975 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-4B_0.975 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-4B --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-4.toml --model_parallel --wandb_name Qwen1.5-4B_0.99 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-4B_0.99 + +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-7B --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-7B_0.5 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-7B_0.5 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-7B --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-7B_0.9 --wandb_group 
qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-7B_0.9 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-7B --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-7B_0.95 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-7B_0.95 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-7B --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-7B_0.975 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-7B_0.975 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-7B --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-7B_0.99 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-7B_0.99 + +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-14B --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-14B_0.5 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-14B_0.5 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-14B --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-14B_0.9 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-14B_0.9 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-14B --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-14B_0.95 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-14B_0.95 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-14B --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-14B_0.975 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-14B_0.975 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-14B --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-14B_0.99 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-14B_0.99 + +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-32B --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-32B_0.5 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-32B_0.5 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-32B --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-32B_0.9 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-32B_0.9 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-32B --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-32B_0.95 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-32B_0.95 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-32B --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-32B_0.975 --wandb_group 
qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-32B_0.975 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-32B --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-2.toml --model_parallel --wandb_name Qwen1.5-32B_0.99 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-32B_0.99 + +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-72B --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-72B_0.5 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-72B_0.5 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-72B --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-72B_0.9 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-72B_0.9 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-72B --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-72B_0.95 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-72B_0.95 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-72B --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-72B_0.975 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-72B_0.975 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-72B --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-72B_0.99 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-72B_0.99 + +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-110B --q_ratio 0.5 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-110B_0.5 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-110B_0.5 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-110B --q_ratio 0.9 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-110B_0.9 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-110B_0.9 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-110B --q_ratio 0.95 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-110B_0.95 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-110B_0.95 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-110B --q_ratio 0.975 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-110B_0.975 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-110B_0.975 +python src/main.py --model_arch qwen1.5 --model_name Qwen/Qwen1.5-110B --q_ratio 0.99 --search_config configs/search/matmult/mxint_4bit-pajama-random-50-bs-1.toml --model_parallel --wandb_name Qwen1.5-110B_0.99 --wandb_group qwen15-matmult-search --save_dir results/search/matmult/Qwen1.5-110B_0.99 + diff --git a/llm-q-scaling-law-master/run_scripts/matmult/runscript_generator.py b/llm-q-scaling-law-master/run_scripts/matmult/runscript_generator.py new file mode 100644 index 
0000000000000000000000000000000000000000..8b6d2043f6e6f741f19b41aeeb3e2664cd07a012 --- /dev/null +++ b/llm-q-scaling-law-master/run_scripts/matmult/runscript_generator.py @@ -0,0 +1,111 @@ +""" +This generates the runscripts for the LLM-Q scaling law simulations. + +""" + +import os +import argparse + +# the search command is +#python src/main.py --model_name {model_name} --search_config configs/search/mxint_4bit-tinyllama.toml --model_parallel --disable_wandb --save_dir ../ckpt/test +# the save dir is +# results/search/layerwise/{model_name}_{ratio} + +granularity = ["matmult"] + +# opt template for searching + +opt_model_sizes=["opt-125m","opt-350m","opt-1.3b","opt-2.7b","opt-6.7b","opt-13b","opt-30b","opt-66b"] +opt_batch_size = [8,8,8,8,4,2,1,1] +opt_matmult_ratios = [0.5,0.9,0.95,0.975,0.99] +opt_matmult_search_config_template = "mxint_4bit-pajama-random-50-bs-{}.toml" +opt_matmult_search_dir_template = "search/matmult/{}_{}" + +opt_matmult_run_command_palette = "python src/main.py --model_arch opt --model_name {model_name} --q_ratio {q_ratio} --search_config configs/search/matmult/{search_config} --model_parallel --wandb_name {wandb_name} --wandb_group opt-matmult-search --save_dir results/{save_dir}" + +with open("opt_matmult_run_commands.txt", "w") as f: + for i in range(len(opt_model_sizes)): + model_size = opt_model_sizes[i] + model_batch_size = opt_batch_size[i] + for ratio in opt_matmult_ratios: + search_config = opt_matmult_search_config_template.format(model_batch_size) + save_dir = opt_matmult_search_dir_template.format(model_size, ratio) + model_name = f"facebook/{model_size}" + wandb_name = f"{model_size}_{ratio}" + run_command = opt_matmult_run_command_palette.format(model_name=model_name, q_ratio=ratio, search_config=search_config, wandb_name=wandb_name, save_dir=save_dir) + # print(run_command) + f.write(run_command + "\n") + f.write("\n") + + +# qwen template for searching + +qwen15_model_sizes=["Qwen1.5-0.5B","Qwen1.5-1.8B","Qwen1.5-4B","Qwen1.5-7B","Qwen1.5-14B","Qwen1.5-32B","Qwen1.5-72B","Qwen1.5-110B"] +qwen15_batch_size = [8,8,4,2,2,2,1,1] +qwen15_matmult_ratios = [0.5,0.9,0.95,0.975,0.99] +qwen15_matmult_search_config_template = "mxint_4bit-pajama-random-50-bs-{}.toml" +qwen15_matmult_search_dir_template = "search/matmult/{}_{}" + +qwen15_matmult_run_command_palette = "python src/main.py --model_arch qwen1.5 --model_name {model_name} --q_ratio {q_ratio} --search_config configs/search/matmult/{search_config} --model_parallel --wandb_name {wandb_name} --wandb_group qwen15-matmult-search --save_dir results/{save_dir}" + +with open("qwen1.5_matmult_run_commands.txt", "w") as f: + for i in range(len(qwen15_model_sizes)): + model_size = qwen15_model_sizes[i] + model_batch_size = qwen15_batch_size[i] + for ratio in qwen15_matmult_ratios: + search_config = qwen15_matmult_search_config_template.format(model_batch_size) + save_dir = qwen15_matmult_search_dir_template.format(model_size, ratio) + model_name = f"Qwen/{model_size}" + wandb_name = f"{model_size}_{ratio}" + run_command = qwen15_matmult_run_command_palette.format(model_name=model_name, q_ratio=ratio, search_config=search_config, wandb_name=wandb_name, save_dir=save_dir) + # print(run_command) + f.write(run_command + "\n") + f.write("\n") + +# llama template for searching + +# qwen template for searching + +llama_model_sizes=["llama-7b","llama-13b","llama-30b","llama-65b"] +llama_batch_size = [2,2,1,1] +llama_matmult_ratios = [0.5,0.9,0.95,0.975,0.99] +llama_matmult_search_config_template = 
"mxint_4bit-pajama-random-50-bs-{}.toml" +llama_matmult_search_dir_template = "search/matmult/{}_{}" + +llama_matmult_run_command_palette = "python src/main.py --model_arch llama --model_name {model_name} --q_ratio {q_ratio} --search_config configs/search/matmult/{search_config} --model_parallel --wandb_name {wandb_name} --wandb_group llama-matmult-search --save_dir results/{save_dir}" + +with open("llama_matmult_run_commands.txt", "w") as f: + for i in range(len(llama_model_sizes)): + model_size = llama_model_sizes[i] + model_batch_size = llama_batch_size[i] + for ratio in llama_matmult_ratios: + search_config = llama_matmult_search_config_template.format(model_batch_size) + save_dir = llama_matmult_search_dir_template.format(model_size, ratio) + model_name = f"huggyllama/{model_size}" + wandb_name = f"{model_size}_{ratio}" + run_command = llama_matmult_run_command_palette.format(model_name=model_name, q_ratio=ratio, search_config=search_config, wandb_name=wandb_name, save_dir=save_dir) + # print(run_command) + f.write(run_command + "\n") + f.write("\n") + +llama2_model_sizes=["Llama-2-7b","Llama-2-13b","Llama-2-70b"] +llama2_batch_size = [2,2,1,1] +llama2_matmult_ratios = [0.5,0.9,0.95,0.975,0.99] +llama2_matmult_search_config_template = "mxint_4bit-pajama-random-50-bs-{}.toml" +llama2_matmult_search_dir_template = "search/matmult/{}_{}" + +llama2_matmult_run_command_palette = "python src/main.py --model_arch llama2 --model_name {model_name} --q_ratio {q_ratio} --search_config configs/search/matmult/{search_config} --model_parallel --wandb_name {wandb_name} --wandb_group llama-layer-search --save_dir results/{save_dir}" + +with open("llama2_matmult_run_commands.txt", "w") as f: + for i in range(len(llama2_model_sizes)): + model_size = llama2_model_sizes[i] + model_batch_size = llama2_batch_size[i] + for ratio in llama2_matmult_ratios: + search_config = llama2_matmult_search_config_template.format(model_batch_size) + save_dir = llama2_matmult_search_dir_template.format(model_size, ratio) + model_name = f"meta-llama/{model_size}-chat-hf" # use the instruction tempalte + wandb_name = f"{model_size}_{ratio}" + run_command = llama2_matmult_run_command_palette.format(model_name=model_name, q_ratio=ratio, search_config=search_config, wandb_name=wandb_name, save_dir=save_dir) + # print(run_command) + f.write(run_command + "\n") + f.write("\n") \ No newline at end of file diff --git a/llm-q-scaling-law-master/scripts/download_model.sh b/llm-q-scaling-law-master/scripts/download_model.sh new file mode 100644 index 0000000000000000000000000000000000000000..cffcfc0602b939606a67bf08b40db0cc9e75e097 --- /dev/null +++ b/llm-q-scaling-law-master/scripts/download_model.sh @@ -0,0 +1,8 @@ +huggingface-cli download facebook/opt-125m +huggingface-cli download facebook/opt-350m +huggingface-cli download facebook/opt-1.3b +huggingface-cli download facebook/opt-2.7b +huggingface-cli download facebook/opt-6.7b +huggingface-cli download facebook/opt-13b +huggingface-cli download facebook/opt-30b +huggingface-cli download facebook/opt-66b \ No newline at end of file diff --git a/llm-q-scaling-law-master/scripts/hf_opt_mmlu.sh b/llm-q-scaling-law-master/scripts/hf_opt_mmlu.sh new file mode 100644 index 0000000000000000000000000000000000000000..20218cdf7e2fa7d5c8a67e80fb5d020c814583e0 --- /dev/null +++ b/llm-q-scaling-law-master/scripts/hf_opt_mmlu.sh @@ -0,0 +1,45 @@ +lm_eval --model hf \ + --model_args pretrained=facebook/opt-125m,dtype="half" \ + --tasks mmlu \ + --device cuda:0 \ + --batch_size auto:4 + 
+lm_eval --model hf \ + --model_args pretrained=facebook/opt-350m,dtype="half" \ + --tasks mmlu \ + --device cuda:0 \ + --batch_size auto:4 + +lm_eval --model hf \ + --model_args pretrained=facebook/opt-1.3b,dtype="half" \ + --tasks mmlu \ + --device cuda:0 \ + --batch_size auto:4 + +lm_eval --model hf \ + --model_args pretrained=facebook/opt-2.7b,dtype="half" \ + --tasks mmlu \ + --device cuda:0 \ + --batch_size auto:4 + +lm_eval --model hf \ + --model_args pretrained=facebook/opt-6.7b,dtype="half" \ + --tasks mmlu \ + --device cuda:0 \ + --batch_size auto:4 + +lm_eval --model hf \ + --model_args pretrained=facebook/opt-13b,dtype="half" \ + --tasks mmlu \ + --device cuda:0 \ + --batch_size auto:4 + +lm_eval --model hf \ + --tasks mmlu \ + --model_args parallelize=True,pretrained=facebook/opt-30b,dtype="half" \ + --batch_size 4 + +lm_eval --model hf \ + --tasks mmlu \ + --model_args parallelize=True,pretrained=facebook/opt-66b,dtype="half" \ + --batch_size auto:4 diff --git a/llm-q-scaling-law-master/src/__init__.py b/llm-q-scaling-law-master/src/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/llm-q-scaling-law-master/src/custom_harness_tasks/scaling_law_benchmarks/scaling_law_easy.yaml b/llm-q-scaling-law-master/src/custom_harness_tasks/scaling_law_benchmarks/scaling_law_easy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f84dbd131cc224d834f4b5f8fc61adec8bedd767 --- /dev/null +++ b/llm-q-scaling-law-master/src/custom_harness_tasks/scaling_law_benchmarks/scaling_law_easy.yaml @@ -0,0 +1,10 @@ +group: scaling_law_easy +task: + - task: commonsense_qa # Commonsense + num_fewshot: 7 + - task: winogrande # Commonsense + fewshot_split: train + num_fewshot: 5 + - task: arc_challenge # Math, reasoning, and problem solving + fewshot_split: validation + num_fewshot: 25 diff --git a/llm-q-scaling-law-master/src/custom_harness_tasks/scaling_law_benchmarks/scaling_law_hard.yaml b/llm-q-scaling-law-master/src/custom_harness_tasks/scaling_law_benchmarks/scaling_law_hard.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b421c3554b38266705c900d02c2d3f65522577df --- /dev/null +++ b/llm-q-scaling-law-master/src/custom_harness_tasks/scaling_law_benchmarks/scaling_law_hard.yaml @@ -0,0 +1,6 @@ +group: scaling_law_hard +task: + - leaderboard_mmlu_pro # Math, reasoning, and problem solving + - leaderboard_bbh # aggregate + - leaderboard_math_hard # Math, reasoning, and problem solving + - gsm8k_cot # long context diff --git a/llm-q-scaling-law-master/src/full_eval.py b/llm-q-scaling-law-master/src/full_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..02f12bf87e2fe1d074424977ef8bf88388e05d0f --- /dev/null +++ b/llm-q-scaling-law-master/src/full_eval.py @@ -0,0 +1,333 @@ +import ast +import copy +from typing import List +import os +from argparse import ArgumentParser +from pathlib import Path + +from transformers import set_seed +import torch + +from accelerate import ( + infer_auto_device_map, + dispatch_model, +) + +import sys +from contextlib import contextmanager, redirect_stdout +from os import devnull + + +# hack + +from llm_q_scaling_law.hf_model_map import ( + get_config_cls, + get_model_cls, + get_tokenizer_cls, +) +from llm_q_scaling_law.utils import load_config + +# temporary +from llm_q_scaling_law.models import quantize_transformer + +import os + +sys.path.append( + os.path.join( + os.path.dirname(os.path.realpath(__file__)), + 
"lm-evaluation-harness", + ) +) + +os.environ["PYTHONBREAKPOINT"] = "ipdb.set_trace" +os.environ["TOKENIZERS_PARALLELISM"] = "false" + +from lm_eval.evaluator import simple_evaluate, TaskManager +from lm_eval.models.huggingface import HFLM + +task_manager = TaskManager( + include_path=Path(__file__).resolve().parent.joinpath("custom_harness_tasks").as_posix(), + include_defaults=True, +) + + +@contextmanager +def silent_harness_evaluator(): + with open(devnull, "w") as fnull: + with redirect_stdout(fnull) as out: + yield out + + +def create_device_map(model, device_map) -> dict[str, int]: + if device_map == "auto": + device_map = infer_auto_device_map( + model, + no_split_module_classes=model._no_split_modules, + ) + elif device_map == "auto-balanced": + max_memory = {i: torch.cuda.mem_get_info(i)[0] // 2 for i in range(torch.cuda.device_count())} + device_map = infer_auto_device_map( + model, + no_split_module_classes=model._no_split_modules, + max_memory=max_memory, + ) + n_devices = torch.cuda.device_count() + n_decoder_layers = model.config.num_hidden_layers + n_layers_per_device = n_decoder_layers // n_devices + balanced_device_map = {} + current_device = 0 + current_decoder_idx = 0 + + for layer_name in device_map: + if ".layers." in layer_name: + if (current_decoder_idx + 1) % n_layers_per_device == 0: + current_device += 1 + current_decoder_idx += 1 + balanced_device_map[layer_name] = min(current_device, n_devices - 1) + device_map = balanced_device_map + else: + assert isinstance(device_map, dict) + return device_map + + +def my_dispatch_model(model, device, device_map): + if device_map: + model = dispatch_model(model, device_map) + else: + model.to(device) + return model + + +def calculate_mmlu_metric(result_dict): + # special handling for mmlu metric + avg_mmlu = result_dict["results"]["mmlu"]["acc,none"] + mmlu_stem = result_dict["results"]["mmlu_stem"]["acc,none"] + mmlu_humanities = result_dict["results"]["mmlu_humanities"]["acc,none"] + mmlu_social_science = result_dict["results"]["mmlu_social_sciences"]["acc,none"] + mmlu_other = result_dict["results"]["mmlu_other"]["acc,none"] + + # return avg_mmlu, mmlu_stem, mmlu_humanities, mmlu_social_science, mmlu_other + return avg_mmlu + + +def calculate_sw_metric(result_dict): + avg_acc = [] + for task in result_dict["results"]: + if "acc,none" in result_dict["results"][task]: + acc_entry = "acc,none" + else: + raise ValueError(f"Failed to find 'acc' entry in {task} result:\n{result_dict['results'][task]}") + avg_acc.append(result_dict["results"][task][acc_entry]) + avg_acc = sum(avg_acc) / len(avg_acc) + + return avg_acc + + +class EvalBase: + def __init__( + self, + model_arch: str, + model_name: str, + eval_config: dict | str, + device: str, + model_parallel: bool = False, + local_hf_cache: str = None, + ) -> None: + self.model_arch = model_arch + self.model_name = model_name + self.model_cls = get_model_cls(model_arch, "lm") + self.local_hf_cache = local_hf_cache + self.config_cls = get_config_cls(model_arch) + self.tokenizer = get_tokenizer_cls(model_arch).from_pretrained(model_name, legacy=False) + self.model_config = self.config_cls.from_pretrained(model_name) + self.device = device + self.model_parallel = model_parallel + + self.eval_config = eval_config if isinstance(eval_config, dict) else load_config(eval_config) + self.dtype = torch.bfloat16 + + self.load_model_from_hf() + + def load_model_from_hf(self): + config = self.config_cls.from_pretrained( + self.model_name, torch_dtype=torch.bfloat16, 
_attn_implementation="eager" + ) + config.use_cache = False + + self.device_map = None + + if self.local_hf_cache: + model = self.model_cls.from_pretrained( + self.model_name, + config=config, + local_files_only=True, + cache_dir=self.local_hf_cache, + # torch_dtype=self.dtype, + ) + else: + model = self.model_cls.from_pretrained( + self.model_name, + config=config, + # torch_dtype=self.dtype + ) + + if self.model_parallel: + if hasattr(model, "tie_weights"): + model.tie_weights() + + if "device_map" not in self.eval_config["setup"]: + device_map = create_device_map(model, "auto-balanced") + else: + _device_map = self.eval_config["setup"]["device_map"] + if isinstance(_device_map, str) and _device_map in ["auto", "auto-balanced"]: + device_map = create_device_map(model, _device_map) + else: + device_map = ast.literal_eval(_device_map) + + self.device_map = device_map + else: + self.device_map = None + + self.model = model + + +class EvalRunner(EvalBase): + def __init__( + self, + model_arch: str, + model_name: str, + seq_len: int, + eval_config: dict | str, + device: str, + model_parallel: bool = False, + local_hf_cache: str = None, + quantized_list: List[int] = None, + ) -> None: + super().__init__( + model_arch, + model_name, + eval_config, + device, + model_parallel, + local_hf_cache, + ) + + self.tasks = self.eval_config["evaluation"]["tasks"] + self.batch_size = self.eval_config["evaluation"]["batch_size"] + self.seq_len = seq_len + self.quantized_list = quantized_list + + def eval(self): + q_model = copy.deepcopy(self.model) + + granularity = self.eval_config["setup"]["granularity"] + + if granularity == "matmult": + _granularity = "gemm" + elif granularity == "transformer_layer": + _granularity = "decoder_layer" + else: + raise ValueError(f"Unknown granularity {granularity}") + + op_ids = [] + for i, is_q in enumerate(self.quantized_list): + if is_q: + op_ids.append(i) + + q_config_raw = self.eval_config["quantization"] + + q_model, q_config_full = quantize_transformer( + q_model, q_config=q_config_raw, op_ids=op_ids, granularity=_granularity + ) + + result_dict = self.eval_task(my_dispatch_model(q_model, self.device, self.device_map), self.batch_size) + + # special treatment for aggregated metrics such as mmlu + if self.tasks == "mmlu": + result = calculate_mmlu_metric(result_dict) + else: + result = calculate_sw_metric(result_dict) + + return result + + def eval_task( + self, + model, + batch_size: int, + ): + with silent_harness_evaluator(): + result_dict = simple_evaluate( + model=HFLM(model, tokenizer=self.tokenizer), + tasks=self.tasks, + batch_size=batch_size, + limit=None, # full evaluation + task_manager=task_manager, + ) + return result_dict + + +def cli(): + parser = ArgumentParser() + parser.add_argument("--model_arch", type=str, required=True) + parser.add_argument("--model_name", type=str, required=True) + parser.add_argument("--eval_config", type=str, required=True) + parser.add_argument("--ratio", type=str, required=True) + parser.add_argument("--accelerator", type=str, default="cuda:0") + parser.add_argument("--model_parallel", action="store_true") + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--local_hf_cache", type=str, default=None) + # read in the quantized list + parser.add_argument("--quantized_list", type=str, default=None) + args = parser.parse_args() + + # convert quantized_list to a list of int + if args.quantized_list is not None: + args.quantized_list = [int(i) for i in args.quantized_list.split(",")] + else: + 
args.quantized_list = [] + + eval_config = load_config(args.eval_config) + + if args.ratio is not None: + # allow overriding the q_ratio from the command line + print("overwriting q_ratio from command line: {}".format(args.ratio)) + eval_config["setup"]["ratio"] = args.ratio + + if args.seed is not None: + set_seed(args.seed) + + eval_runner = EvalRunner( + model_arch=args.model_arch, + model_name=args.model_name, + seq_len=1024, + eval_config=eval_config, + device=args.accelerator, + model_parallel=args.model_parallel, + local_hf_cache=args.local_hf_cache, + quantized_list=args.quantized_list, + ) + + result = eval_runner.eval() + + print("Evaluation Average Metric:") + print(result) + + processed_model_name = args.model_name.split("/")[-1] + + # appen the result string to the log file + file_name = f"result/eval_log/{processed_model_name}-{args.ratio}.txt" + # create the subfolder if it does not exist + os.makedirs("result/eval_log", exist_ok=True) + if not os.path.exists(file_name): + with open(file_name, "w") as f: + f.write("") + + with open(file_name, "a") as f: + # f.write( + # f"model_arch: {args.model_arch}, model_name: {args.model_name}, ratio:{args.ratio}, quantized_list: {args.quantized_list}\n" + # ) + f.write(str(result) + "\n") + + +if __name__ == "__main__": + cli() diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/__init__.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..73385cadde92b2b34b91e813bdc86ad7d556e8b7 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/__init__.py @@ -0,0 +1,5 @@ +import sys +import os +from pathlib import Path + +SCALING_LAW_SRC = Path(__file__).parent.resolve() diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/custom_datasets/__init__.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/custom_datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6b0197f612ff30150157616da43044e6adc19801 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/custom_datasets/__init__.py @@ -0,0 +1,64 @@ +import logging + +import datasets + +from .glue import GLUE_TASKS +from .glue import get_num_labels as get_num_labels_glue +from .glue import get_raw_dataset_dict as get_raw_dataset_dict_glue +from .glue import preprocess_dataset_dict as preprocess_dataset_dict_glue +from .wikitext2 import get_raw_dataset_dict as get_raw_dataset_dict_wikitext2 +from .wikitext2 import \ + preprocess_dataset_dict as preprocess_dataset_dict_wikitext2 + +logger = logging.getLogger(__name__) + + +def get_num_labels(task: str): + if task in GLUE_TASKS: + return get_num_labels_glue(task) + elif task == "wikitext2": + logger.warning( + "returning None for num_labels for language modeling dataset wikitext2" + ) + return None + else: + raise ValueError(f"task {task} not supported") + + +def get_raw_dataset_dict(task: str) -> datasets.DatasetDict: + if task in GLUE_TASKS: + return get_raw_dataset_dict_glue(task) + elif task == "wikitext2": + return get_raw_dataset_dict_wikitext2() + else: + raise ValueError(f"task {task} not supported") + + +def preprocess_dataset_dict( + raw_dataset_dict, task: str, tokenizer, padding, max_length +) -> datasets.DatasetDict: + if task in GLUE_TASKS: + return preprocess_dataset_dict_glue( + raw_dataset_dict, + task=task, + tokenizer=tokenizer, + padding=padding, + max_length=max_length, + ) + elif task == "wikitext2": + return preprocess_dataset_dict_wikitext2( + raw_dataset_dict, + 
tokenizer=tokenizer, + max_length=max_length, + ) + else: + raise ValueError(f"task {task} not supported") + + +def is_regression_task(task: str) -> bool: + if task in GLUE_TASKS: + return task == "stsb" + elif task == "wikitext2": + return False + else: + raise ValueError(f"task {task} not supported") diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/custom_datasets/glue.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/custom_datasets/glue.py new file mode 100644 index 0000000000000000000000000000000000000000..3c6af80512173a3bfd243a52150e02f049e02c15 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/custom_datasets/glue.py @@ -0,0 +1,75 @@ +import datasets as hf_datasets + +TASK_TO_KEYS = { + "cola": ("sentence", None), + "mnli": ("premise", "hypothesis"), + "mrpc": ("sentence1", "sentence2"), + "qnli": ("question", "sentence"), + "qqp": ("question1", "question2"), + "rte": ("sentence1", "sentence2"), + "sst2": ("sentence", None), + "stsb": ("sentence1", "sentence2"), + "wnli": ("sentence1", "sentence2"), +} + +GLUE_TASKS = list(TASK_TO_KEYS.keys()) + + +def get_num_labels(task: str): + assert task in TASK_TO_KEYS, f"task {task} not supported" + + raw_datasets = hf_datasets.load_dataset("glue", task) + is_regression = task == "stsb" + + if not is_regression: + label_list = raw_datasets["train"].features["label"].names + num_labels = len(label_list) + else: + num_labels = 1 + + return num_labels + + +def get_raw_dataset_dict(task: str) -> hf_datasets.DatasetDict: + assert task in TASK_TO_KEYS, f"task {task} not supported" + raw_datasets = hf_datasets.load_dataset("glue", task) + return raw_datasets + + +def preprocess_dataset_dict( + raw_dataset_dict, task: str, tokenizer, padding, max_length +) -> hf_datasets.DatasetDict: + assert task in TASK_TO_KEYS, f"task {task} not supported" + sentence1_key, sentence2_key = TASK_TO_KEYS[task] + + def preprocess_fn(examples): + texts = ( + (examples[sentence1_key],) + if sentence2_key is None + else (examples[sentence1_key], examples[sentence2_key]) + ) + result = tokenizer( + *texts, + padding=padding, + max_length=max_length, + truncation=True, + ) + result["labels"] = examples["label"] + return result + + processed_dataset = raw_dataset_dict.map( + preprocess_fn, + batched=True, + remove_columns=raw_dataset_dict["train"].column_names, + desc="Running tokenizer on dataset", + ) + train_dataset = processed_dataset["train"] + val_dataset = processed_dataset[ + "validation_matched" if task == "mnli" else "validation" + ] + test_dataset = processed_dataset["test_matched" if task == "mnli" else "test"] + return hf_datasets.DatasetDict( + train=train_dataset, + validation=val_dataset, + test=test_dataset, + ) diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/custom_datasets/wikitext2.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/custom_datasets/wikitext2.py new file mode 100644 index 0000000000000000000000000000000000000000..34a02ac9a4176e9c9f1453001c0dd67856795394 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/custom_datasets/wikitext2.py @@ -0,0 +1,55 @@ +import os + +import datasets as hf_datasets + + +def get_raw_dataset_dict() -> hf_datasets.DatasetDict: + dataset_dict = hf_datasets.load_dataset("wikitext", "wikitext-2-raw-v1") + return dataset_dict + + +def preprocess_dataset_dict( + raw_dataset_dict, + tokenizer, + max_length, +) -> hf_datasets.DatasetDict: + if tokenizer.pad_token in ["", None]: + tokenizer.pad_token = tokenizer.eos_token + + def tokenize_function(examples): + 
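+ # Join the whole batch of raw lines into a single string and tokenize it
+ # without truncation; group_texts below re-chunks the token stream into
+ # fixed blocks of max_length tokens.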
return tokenizer([" ".join(examples["text"])]) + + encodings = raw_dataset_dict.map( + tokenize_function, + batched=True, + remove_columns=raw_dataset_dict["train"].column_names, + desc="Running tokenizer on dataset", + num_proc=os.cpu_count() // 2, + ) + + def group_texts(examples): + # Concatenate all texts. + # >>> sum([[1,2,3],[4,5,6]],[]) + # [1, 2, 3, 4, 5, 6] + concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= max_length: + total_length = (total_length // max_length) * max_length + # Split by chunks of block_size. + result = { + k: [t[i : i + max_length] for i in range(0, total_length, max_length)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + preprocessed = encodings.map( + group_texts, + batched=True, + num_proc=os.cpu_count() // 2, + desc="Grouping texts", + ) + + return preprocessed diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/custom_tasks/huggingface_evaluator.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/custom_tasks/huggingface_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..92a753cbc5683b49d52730a069012abce0044352 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/custom_tasks/huggingface_evaluator.py @@ -0,0 +1,52 @@ +import numpy as np +import torch +from tqdm import tqdm + +RANDOM_SEED = 42 + +def huggingface_dataset_process_limit( + dataset, + limit + ): + """ + Process the dataset with the specified limit with torch subset + if limit > 1, randomly sample a subset of the data with the specified size + if limit <= 1, randomly sample a subset of the data with the specified ratio, + use limit = 1 for full dataset + """ + if limit > 1: + # randomly sample a subset of the data with the specified size + np.random.seed(RANDOM_SEED) + subsample_indices = np.random.choice(len(dataset), limit, replace=False) + dataset = dataset.select(subsample_indices) + + if limit <= 1: + # randomly sample a subset of the data with the specified size + np.random.seed(RANDOM_SEED) + total_size = len(dataset) + subsample_size = total_size * limit + subsample_indices = np.random.choice(len(dataset), subsample_size, replace=False) + dataset = dataset.select(subsample_indices) + + return dataset + + +def huggingface_ppl_evaluator( + model, + tokenizer, + train_loader, + device + ): + + perplexities = [] + + for i,batch in tqdm(enumerate(train_loader), total=len(train_loader),desc="Calculating perplexity: "): + inputs = batch["input_ids"].to(device) + attention_mask = batch["attention_mask"].to(device) + with torch.no_grad(): + outputs = model(inputs, labels=inputs, attention_mask=attention_mask) + loss = outputs.loss.item() + perplexity = np.exp(loss) + perplexities.append(perplexity) + + return perplexities diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/custom_tasks/open_alpaca.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/custom_tasks/open_alpaca.py new file mode 100644 index 0000000000000000000000000000000000000000..a3ebd5cd8a27c7cb9424c82d761e713e0e791de7 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/custom_tasks/open_alpaca.py @@ -0,0 +1,99 @@ +import numpy as np +import datasets +import torch +from transformers import DataCollatorForLanguageModeling + +from 
..custom_tasks.huggingface_evaluator import ( + huggingface_ppl_evaluator, + huggingface_dataset_process_limit, +) + +# silence transformers logging warning about token length exceeding max_length +from transformers.utils import logging +logging.set_verbosity(40) + +def preprocess_data_module_open_alpaca( + raw_data_module, + tokenizer, + max_length, + num_proc: int, +) -> datasets.DatasetDict: + if tokenizer.pad_token in ["", None]: + tokenizer.pad_token = tokenizer.eos_token + + def tokenize_function(examples): + return tokenizer(["\n\n".join(examples["text"])]) + + encodings = raw_data_module.map( + tokenize_function, + batched=True, + remove_columns=raw_data_module["train"].column_names, + desc="Running tokenizer on dataset", + num_proc=num_proc, + ) + + def group_texts(examples): + # Concatenate all texts. + # >>> sum([[1,2,3],[4,5,6]],[]) + # [1, 2, 3, 4, 5, 6] + concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= max_length: + total_length = (total_length // max_length) * max_length + # Split by chunks of block_size. + result = { + k: [t[i : i + max_length] for i in range(0, total_length, max_length)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + preprocessed = encodings.map( + group_texts, + batched=True, + num_proc=num_proc, + desc="Grouping texts", + ) + + return preprocessed + + +def alpaca_evaluator(model, tokenizer, batch_size, limit): + dataset_dict = datasets.load_dataset("tatsu-lab/alpaca") + preprocessed_dataset_dict = preprocess_data_module_open_alpaca( + dataset_dict, tokenizer, 1024, 64 + ) + train_dataset = preprocessed_dataset_dict["train"] + + train_dataset = huggingface_dataset_process_limit(train_dataset, limit) + + # get the perplexity from output + device = next(iter(model.parameters())).device + data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) + train_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size=batch_size, + collate_fn=data_collator, + shuffle=False, + num_workers=8, + ) + + perplexities = huggingface_ppl_evaluator( + model, + tokenizer, + train_loader, + device + ) + + result_dict = {} + result_dict["results"] = perplexities + return result_dict + + +def calculate_alpaca_ppl(result_dict): + ppls = result_dict["results"] + avg_ppl = np.mean(ppls) + # round_avg_ppl = round(avg_ppl, 2) + return -avg_ppl # this is a hack for unidirection optuna search diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/custom_tasks/slim_pajama.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/custom_tasks/slim_pajama.py new file mode 100644 index 0000000000000000000000000000000000000000..ad61260500eda77822bc5b7722aed5a9d5fff599 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/custom_tasks/slim_pajama.py @@ -0,0 +1,101 @@ +import numpy as np +import datasets +import torch +from transformers import DataCollatorForLanguageModeling + +from ..custom_tasks.huggingface_evaluator import ( + huggingface_ppl_evaluator, + huggingface_dataset_process_limit, +) + +# silence transformers logging warning about token length exceeding max_length +from transformers.utils import logging +logging.set_verbosity(40) + +def preprocess_data_module_slim_pajama_6b( + raw_data_module, + tokenizer, + 
max_length, + num_proc: int, +) -> datasets.DatasetDict: + if tokenizer.pad_token in ["", None]: + tokenizer.pad_token = tokenizer.eos_token + + def tokenize_function(examples): + return tokenizer(["\n\n".join(examples["text"])]) + + encodings = raw_data_module.map( + tokenize_function, + batched=True, + remove_columns=raw_data_module["train"].column_names, + desc="Running tokenizer on dataset", + num_proc=num_proc, + ) + + def group_texts(examples): + # Concatenate all texts. + # >>> sum([[1,2,3],[4,5,6]],[]) + # [1, 2, 3, 4, 5, 6] + concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= max_length: + total_length = (total_length // max_length) * max_length + # Split by chunks of block_size. + result = { + k: [t[i : i + max_length] for i in range(0, total_length, max_length)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + preprocessed = encodings.map( + group_texts, + batched=True, + num_proc=num_proc, + desc="Grouping texts", + ) + + return preprocessed + + +def pajama_evaluator(model, tokenizer, batch_size, limit): + dataset_list = datasets.load_dataset("DKYoon/SlimPajama-6B", split=['train[:1%]','test']) + dataset_dict = datasets.DatasetDict({"train": dataset_list[0], "test": dataset_list[1]}) + + preprocessed_dataset_dict = preprocess_data_module_slim_pajama_6b( + dataset_dict, tokenizer, 1024, 64 + ) + train_dataset = preprocessed_dataset_dict["test"] + + train_dataset = huggingface_dataset_process_limit(train_dataset, limit) + + # get the perplexity from output + device = next(iter(model.parameters())).device + data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) + train_loader = torch.utils.data.DataLoader( + train_dataset, + batch_size=batch_size, + collate_fn=data_collator, + shuffle=False, + num_workers=8, + ) + + perplexities = huggingface_ppl_evaluator( + model, + tokenizer, + train_loader, + device + ) + + result_dict = {} + result_dict["results"] = perplexities + return result_dict + + +def calculate_pajama_ppl(result_dict): + ppls = result_dict["results"] + avg_ppl = np.mean(ppls) + # round_avg_ppl = round(avg_ppl, 2) + return -avg_ppl # this is a hack for unidirection optuna search diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/hf_model_map.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/hf_model_map.py new file mode 100644 index 0000000000000000000000000000000000000000..4a0dc5176840516532c169574da313e41d654851 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/hf_model_map.py @@ -0,0 +1,101 @@ +from transformers import AutoTokenizer +from transformers.models.bert.tokenization_bert import BertTokenizer +from transformers.models.llama.tokenization_llama import LlamaTokenizer +from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer + + +from transformers import BertForSequenceClassification, BertConfig + +from transformers import ( + LlamaForCausalLM, LlamaForSequenceClassification, LlamaConfig) + +from transformers import ( + OPTForCausalLM, OPTForSequenceClassification, OPTConfig) + +from transformers import ( + Qwen2ForCausalLM, Qwen2ForSequenceClassification, Qwen2Config +) + +from transformers import ( + PhiForCausalLM, PhiForSequenceClassification, PhiConfig +) + +from 
transformers import ( + Phi3ForCausalLM, Phi3ForSequenceClassification, Phi3Config +) + +from transformers import ( + Gemma2ForCausalLM, Gemma2ForSequenceClassification, Gemma2Config +) + +model_map = { + "bert": { + "cls": BertForSequenceClassification, + }, + "llama": { + "cls": LlamaForSequenceClassification, + "lm": LlamaForCausalLM, + }, + "opt": { + "cls": OPTForSequenceClassification, + "lm": OPTForCausalLM, + }, + "qwen2": { + "cls": Qwen2ForSequenceClassification, + "lm": Qwen2ForCausalLM, + }, + "phi": { + "cls": PhiForSequenceClassification, + "lm": PhiForCausalLM, + }, + "phi3": { + "cls": Phi3ForSequenceClassification, + "lm": Phi3ForCausalLM, + }, + "gemma2": { + "cls": Gemma2ForSequenceClassification, + "lm": Gemma2ForCausalLM, + }, +} + +tokenizer_map = { + "bert": BertTokenizer, + "llama": LlamaTokenizer, + "opt": AutoTokenizer, + "qwen2": Qwen2Tokenizer, + "phi": AutoTokenizer, + "phi3": AutoTokenizer, + "gemma2": AutoTokenizer, +} + +config_map = { + "bert": BertConfig, + "llama": LlamaConfig, + "opt": OPTConfig, + "qwen2": Qwen2Config, + "phi": PhiConfig, + "phi3": Phi3Config, + "gemma2": Gemma2Config, +} + + + +def get_model_cls(arch: str, task: str): + assert arch in model_map, f"arch {arch} not supported" + assert task in model_map[arch], f"task {task} not supported for arch {arch}" + return model_map[arch][task] + + +def get_config_cls(arch: str): + assert arch in config_map, f"arch {arch} not supported" + return config_map[arch] + + +def get_tokenizer_cls(arch: str): + assert arch in tokenizer_map, f"arch {arch} not supported" + return tokenizer_map[arch] + + +def get_config_parser(arch: str): + assert arch in config_map, f"arch {arch} not supported" + return config_map[arch] \ No newline at end of file diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/models/__init__.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8ed58a182b88d19fcfa7488d0fd9d3544c75102f --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/models/__init__.py @@ -0,0 +1,46 @@ +from transformers import LlamaForCausalLM,OPTForCausalLM + +from .llama import quantize_llama, estimate_ops_bits_bits_llama +from .opt import quantize_opt, estimate_ops_bits_bits_opt +from .qwen2 import quantize_qwen2, estimate_ops_bits_bits_qwen2 +from .phi import quantize_phi, estimate_ops_bits_bits_phi +from .phi3 import quantize_phi3, estimate_ops_bits_bits_phi3 +from .gemma2 import quantize_gemma2, estimate_ops_bits_bits_gemma2 + +def quantize_transformer(model, q_config, op_ids, granularity): + """ + granularity: str, one of ["decoder_layer", "gemm"] + """ + if model.config.architectures[0] == "LlamaForCausalLM": + return quantize_llama(model, q_config, op_ids, granularity) + elif model.config.architectures[0] == "OPTForCausalLM": + return quantize_opt(model, q_config, op_ids, granularity) + elif model.config.architectures[0] == "Qwen2ForCausalLM": + return quantize_qwen2(model, q_config, op_ids, granularity) + elif model.config.architectures[0] == "PhiForCausalLM": + return quantize_phi(model, q_config, op_ids, granularity) + elif model.config.architectures[0] == "Phi3ForCausalLM": + return quantize_phi3(model, q_config, op_ids, granularity) + elif model.config.architectures[0] == "Gemma2ForCausalLM": + return quantize_gemma2(model, q_config, op_ids, granularity) + else: + raise ValueError(f"Unkown model architecture {model.config.architectures[0]}") + + +def estimate_cost(model, q_config, seq_len): + if 
model.config.architectures[0] == "LlamaForCausalLM": + opts_bits_bits, raw = estimate_ops_bits_bits_llama(model, q_config, seq_len) + elif model.config.architectures[0] == "OPTForCausalLM": + opts_bits_bits, raw = estimate_ops_bits_bits_opt(model, q_config, seq_len) + elif model.config.architectures[0] == "Qwen2ForCausalLM": + opts_bits_bits, raw = estimate_ops_bits_bits_qwen2(model, q_config, seq_len) + elif model.config.architectures[0] == "PhiForCausalLM": + opts_bits_bits, raw = estimate_ops_bits_bits_phi(model, q_config, seq_len) + elif model.config.architectures[0] == "Phi3ForCausalLM": + opts_bits_bits, raw = estimate_ops_bits_bits_phi3(model, q_config, seq_len) + elif model.config.architectures[0] == "Gemma2ForCausalLM": + opts_bits_bits, raw = estimate_ops_bits_bits_gemma2(model, q_config, seq_len) + else: + raise ValueError(f"Unkown model architecture {model.config.architectures[0]}") + + return opts_bits_bits, raw diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/models/_llama_layers.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/models/_llama_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..1a7a08d0934141bdcd2f2cf006f9bfcb50484037 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/models/_llama_layers.py @@ -0,0 +1,263 @@ +import os +import sys + + +from copy import deepcopy + + +sys.path.append( + os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "..", + "..", + "mase", + "machop", + ) +) + +from chop.models.manual.llama_quantized.modeling_llama import LlamaQuantizedDecoderLayer +from chop.models.manual.llama_quantized.configuration_llama import LlamaQuantizedConfig +from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaForCausalLM + + +def decoder_layer_quant_config_config_builder( + low_precision_config, + num_layers, + matmult_ids=None, +): + quant_cfg = {} + matmul_counter = 0 + + for layer_id in range(num_layers): + layer_entry = f"model_layer_{layer_id}" + quant_cfg[layer_entry] = { + "self_attn": { + "q_proj": dict(low_precision_config), + "k_proj": dict(low_precision_config), + "v_proj": dict(low_precision_config), + "o_proj": dict(low_precision_config), + "rotary_positional_encoding": None, + "matmul_0": dict(low_precision_config), + "matmul_1": dict(low_precision_config), + }, + "mlp": { + "gate_proj": dict(low_precision_config), + "down_proj": dict(low_precision_config), + "up_proj": dict(low_precision_config), + }, + } + quant_cfg["default"] = {**low_precision_config} + return quant_cfg + + +def llama_layer_flops_counter(layer: LlamaDecoderLayer, seq_len): + # count number of multiplies in a llama decoder layer + + flops = {} + flops_attn = {} + flops_mlp = {} + + k_proj = layer.self_attn.k_proj + flops_attn["k_proj"] = k_proj.in_features * k_proj.out_features * seq_len + + q_proj = layer.self_attn.q_proj + flops_attn["q_proj"] = q_proj.in_features * q_proj.out_features * seq_len + + v_proj = layer.self_attn.v_proj + flops_attn["v_proj"] = v_proj.in_features * v_proj.out_features * seq_len + + o_proj = layer.self_attn.o_proj + flops_attn["o_proj"] = o_proj.in_features * o_proj.out_features * seq_len + + # rotary_positional_encoding + flops_attn["rotary_positional_encoding"] = ( + k_proj.in_features * seq_len + q_proj.in_features * seq_len + ) + + # matmul_0 + flops_attn["matmul_0"] = seq_len * seq_len * k_proj.out_features + + # matmul_1 + flops_attn["matmul_1"] = seq_len * seq_len * k_proj.out_features + + # gate_proj + flops_mlp["gate_proj"] = ( + 
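+ # A linear layer contributes in_features * out_features * seq_len
+ # multiplies per batch element; e.g. a Llama-7B-sized gate_proj
+ # (4096 -> 11008) at seq_len=256 is roughly 1.15e10 multiplies
+ # (illustrative figures).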
layer.mlp.gate_proj.in_features * layer.mlp.gate_proj.out_features * seq_len + ) + + # down_proj + flops_mlp["down_proj"] = ( + layer.mlp.down_proj.in_features * layer.mlp.down_proj.out_features * seq_len + ) + + # up_proj + flops_mlp["up_proj"] = ( + layer.mlp.up_proj.in_features * layer.mlp.up_proj.out_features * seq_len + ) + + # this estimation ignores the flops of element-wise multiply between gate and down_proj + + flops["attn"] = flops_attn + flops["mlp"] = flops_mlp + + return flops + + +def quantize_decoder_layer_llama( + model: LlamaForCausalLM, lp_config, layer_ids, seq_len=256 +): + config = model.config + q_config = decoder_layer_quant_config_config_builder( + lp_config["quantization"], config.num_hidden_layers + ) + config = LlamaQuantizedConfig.from_pretrained( + model.name_or_path, quant_config=q_config + ) + + layer_wise_ops = {} + + for m_name, m in model.named_modules(): + if isinstance(m, LlamaDecoderLayer): + layer_id = int(m_name.split(".")[-1]) + if layer_id in layer_ids: + new_m = LlamaQuantizedDecoderLayer(config=config, layer_id=layer_id) + new_m.to(next(iter(m.parameters())).device) + new_m.load_state_dict(m.state_dict(), strict=False) + + model.model.layers[layer_id] = new_m + else: + config.quant_config[f"model_layer_{layer_id}"] = { + "self_attn": { + "q_proj": None, + "k_proj": None, + "v_proj": None, + "o_proj": None, + "rotary_positional_encoding": None, + "matmul_0": None, + "matmul_1": None, + }, + "mlp": { + "gate_proj": None, + "down_proj": None, + "up_proj": None, + }, + } + + layer_wise_ops[m_name] = llama_layer_flops_counter(m, seq_len) + + model._no_split_modules = ["LlamaQuantizedDecoderLayer", "LlamaDecoderLayer"] + + return model, config, layer_wise_ops + + +def gemm_quant_config_builder( + low_precision_config, + num_layers, + matmult_ids=[], +): + q_config = {} + gemm_counter = 0 + for layer_id in range(num_layers): + layer_entry = f"model_layer_{layer_id}" + q_config[layer_entry] = {"self_attn": {}, "mlp": {}} + + for op in ["q_proj", "k_proj", "v_proj", "o_proj", "matmul_0", "matmul_1"]: + if gemm_counter in matmult_ids: + q_config[layer_entry]["self_attn"][op] = dict(low_precision_config) + else: + q_config[layer_entry]["self_attn"][op] = None + + gemm_counter += 1 + + if gemm_counter in matmult_ids: + q_config[layer_entry]["mlp"]["gate_proj"] = dict(low_precision_config) + else: + q_config[layer_entry]["mlp"]["gate_proj"] = None + gemm_counter += 1 + + if gemm_counter in matmult_ids: + q_config[layer_entry]["mlp"]["down_proj"] = dict(low_precision_config) + else: + q_config[layer_entry]["mlp"]["down_proj"] = None + gemm_counter += 1 + + if gemm_counter in matmult_ids: + q_config[layer_entry]["mlp"]["up_proj"] = dict(low_precision_config) + else: + q_config[layer_entry]["mlp"]["up_proj"] = None + gemm_counter += 1 + + q_config["default"] = {**low_precision_config} + return q_config + + +def quantize_gemms_llama(model: LlamaForCausalLM, lp_config, matmul_ids, seq_len=256): + config = model.config + q_config = gemm_quant_config_builder( + lp_config["quantization"], config.num_hidden_layers, matmul_ids + ) + config = LlamaQuantizedConfig.from_pretrained( + model.name_or_path, quant_config=q_config + ) + layer_wise_ops = {} + + for m_name, m in model.named_modules(): + if isinstance(m, LlamaDecoderLayer): + device = m.mlp.gate_proj.weight.device + layer_id = int(m_name.split(".")[-1]) + + new_m = LlamaQuantizedDecoderLayer(config=config, layer_id=layer_id) + new_m.to(next(iter(m.parameters())).device) + new_m.load_state_dict(m.state_dict(), 
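+ # strict=False: the quantized decoder layer keeps the original parameter
+ # names, so the weights load one-to-one while any auxiliary buffers that
+ # differ between the two layer classes are tolerated.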
strict=False) + new_m.to(device) + + model.model.layers[layer_id] = new_m + + if isinstance(m, (LlamaDecoderLayer, LlamaQuantizedDecoderLayer)): + layer_wise_ops[m_name] = llama_layer_flops_counter(m, seq_len) + + model._no_split_modules = ["LlamaQuantizedDecoderLayer", "LlamaDecoderLayer"] + + return model, config, layer_wise_ops + + +def _get_opts(ops, config): + if config is None: + return ops * 16 * 16 + return ops * config["weight_width"] * config["data_in_width"] + + +def cost_builder_llama(model, ops, quan_config): + total_ops = 0 + for n, m in model.named_modules(): + if isinstance(m, (LlamaQuantizedDecoderLayer, LlamaDecoderLayer)): + layer_id = int(n.split(".")[-1]) + layer_config = quan_config[f"model_layer_{layer_id}"] + + layer_ops = ops[n] + + for n, nm in layer_config["self_attn"].items(): + total_ops += _get_opts(layer_ops["attn"][n], nm) + + total_ops += _get_opts( + layer_ops["mlp"]["gate_proj"], layer_config["mlp"]["gate_proj"] + ) + total_ops += _get_opts( + layer_ops["mlp"]["down_proj"], layer_config["mlp"]["down_proj"] + ) + total_ops += _get_opts( + layer_ops["mlp"]["up_proj"], layer_config["mlp"]["up_proj"] + ) + + return total_ops + + +def quantize_llama(model, config, ids, seq_len): + granularity = config["setup"]["granularity"] + + if granularity in ["transformer_layer"]: + return quantize_decoder_layer_llama(model, config, ids, seq_len) + elif granularity in ["matmult"]: + return quantize_gemms_llama(model, config, ids, seq_len) + else: + raise ValueError(f"Unkown setup granularity {granularity}") diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/models/_opt_layers.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/models/_opt_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..640ec77386e7440b6f6d9f707a4b08267fb9dbd8 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/models/_opt_layers.py @@ -0,0 +1,235 @@ +# force to enable import chop +import os +import sys +from copy import deepcopy + + +sys.path.append( + os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "..", + "..", + "mase", + "machop", + ) +) + +from chop.models.manual.opt_quantized.configuration_opt import OPTQuantizedConfig +from chop.models.manual.opt_quantized.modeling_opt import OPTQuantizedDecoderLayer +from transformers.models.opt.modeling_opt import OPTDecoderLayer + + +def layer_config_builder(low_precision_config, num_layers, matmult_ids=None): + quant_config = {} + matmult_counter = 0 + for layer_id in range(num_layers): + layer_entry = f"model_layer_{layer_id}" + quant_config[layer_entry] = { + "self_attn": { + "q_proj": dict(low_precision_config), + "k_proj": dict(low_precision_config), + "v_proj": dict(low_precision_config), + "out_proj": dict(low_precision_config), + "bmm_0": dict(low_precision_config), + "bmm_1": dict(low_precision_config), + }, + "fc1": dict(low_precision_config), + "fc2": dict(low_precision_config), + } + quant_config["default"] = {**low_precision_config} + + return quant_config + + +def weight_replacement(x, y): + target_state_dict = deepcopy(x.state_dict()) + y.load_state_dict(target_state_dict) + return y + + +def opt_layer_flops_counter(layer, seq_len): + # assume single batch + flops = {} + flops_attn = {} + + """ + for a linear layer with dim in_features * out_features + if input is in size (seq_len, in_features) + we have in total seq_len * in_features * out_features multiplications + """ + + # k_proj + k_proj = layer.self_attn.k_proj + flops_attn["k_proj"] = k_proj.in_features * 
k_proj.out_features * seq_len + # v_proj + v_proj = layer.self_attn.v_proj + flops_attn["v_proj"] = v_proj.in_features * v_proj.out_features * seq_len + # q_proj + q_proj = layer.self_attn.q_proj + flops_attn["q_proj"] = q_proj.in_features * q_proj.out_features * seq_len + # out_proj + out_proj = layer.self_attn.out_proj + flops_attn["out_proj"] = out_proj.in_features * out_proj.out_features * seq_len + + """ + the first bmm is for attention weights, this is achieved by Q^TK + """ + flops_attn["bmm_0"] = seq_len * seq_len * q_proj.out_features + """ + the second bmm is to use the attention weights to re-scale V + """ + flops_attn["bmm_1"] = seq_len * seq_len * v_proj.out_features + + # fc1 + fc1 = layer.fc1 + flops["fc1"] = fc1.in_features * fc1.out_features * seq_len + + # fc2 + fc2 = layer.fc2 + flops["fc2"] = fc2.in_features * fc2.out_features * seq_len + flops["self_attn"] = flops_attn + return flops + + +def quantize_decoder_layer_opt(model, lp_config, layer_ids, seq_len=256): + # refactor config + config = model.config + quant_config = layer_config_builder( + lp_config["quantization"], config.num_hidden_layers + ) + config = OPTQuantizedConfig.from_pretrained( + model.name_or_path, quant_config=quant_config + ) + + layer_wise_ops = {} + # module level traverse + for n, m in model.named_modules(): + if isinstance(m, OPTDecoderLayer): + # model.decoder.layers.0 is the name for first layer + # brute force slice + layer_id = int(n.split(".")[-1]) + + if layer_id in layer_ids: + # copy value + original = model.model.decoder.layers[layer_id] + new = OPTQuantizedDecoderLayer(config, layer_id=layer_id) + new.to(next(iter(new.parameters())).device) + new = weight_replacement(original, new) + # swap + model.model.decoder.layers[layer_id] = new + else: + config.quant_config[f"model_layer_{layer_id}"] = { + "self_attn": { + "q_proj": None, + "k_proj": None, + "v_proj": None, + "out_proj": None, + "bmm_0": None, + "bmm_1": None, + }, + "fc1": None, + "fc2": None, + } + layer_wise_ops[n] = opt_layer_flops_counter(m, seq_len) + # you can stop here and verify that the model is replaced + + model._no_split_modules = ["OPTQuantizedDecoderLayer", "OPTDecoderLayer"] + + return model, config, layer_wise_ops + + +def fine_grained_layer_config_builder(low_precision_config, num_layers, matmult_ids=[]): + quant_config = {} + matmult_counter = 0 + for layer_id in range(num_layers): + layer_entry = f"model_layer_{layer_id}" + quant_config[layer_entry] = {"self_attn": {}} + for op_idx, op in enumerate( + ["q_proj", "k_proj", "v_proj", "out_proj", "bmm_0", "bmm_1"] + ): + if matmult_counter in matmult_ids: + assert matmult_counter == layer_id * 8 + op_idx + quant_config[layer_entry]["self_attn"][op] = dict(low_precision_config) + else: + quant_config[layer_entry]["self_attn"][op] = None + + matmult_counter += 1 + + if matmult_counter in matmult_ids: + op_idx = 6 + assert matmult_counter == layer_id * 8 + op_idx + quant_config[layer_entry]["fc1"] = dict(low_precision_config) + else: + quant_config[layer_entry]["fc1"] = None + matmult_counter += 1 + + if matmult_counter in matmult_ids: + op_idx = 7 + assert matmult_counter == layer_id * 8 + op_idx + quant_config[layer_entry]["fc2"] = dict(low_precision_config) + else: + quant_config[layer_entry]["fc2"] = None + matmult_counter += 1 + quant_config["default"] = {**low_precision_config} + return quant_config + + +def quantize_gemms_opt(model, lp_config, matmult_ids, seq_len): + config = model.config + quant_config = fine_grained_layer_config_builder( + 
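+ # OPT exposes 8 GEMMs per decoder layer (q/k/v/out_proj, bmm_0, bmm_1,
+ # fc1, fc2), so matmult_ids index a flat range of num_hidden_layers * 8
+ # ops; e.g. id 14 = layer 1, fc1 (1 * 8 + 6).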
lp_config["quantization"], config.num_hidden_layers, matmult_ids + ) + config = OPTQuantizedConfig.from_pretrained( + model.name_or_path, quant_config=quant_config + ) + layer_wise_ops = {} + # module level traverse + for n, m in model.named_modules(): + if isinstance(m, OPTDecoderLayer): + device = m.self_attn.q_proj.weight.device + layer_id = int(n.split(".")[-1]) + original = model.model.decoder.layers[layer_id] + new = OPTQuantizedDecoderLayer(config, layer_id=layer_id) + new.to(next(iter(new.parameters())).device) + new = weight_replacement(original, new) + # swap + new.to(device) + model.model.decoder.layers[layer_id] = new + # model.decoder.layers.0 is the name for first layer + # brute force slice + if isinstance(m, (OPTDecoderLayer, OPTQuantizedDecoderLayer)): + layer_wise_ops[n] = opt_layer_flops_counter(m, seq_len) + model._no_split_modules = ["OPTQuantizedDecoderLayer", "OPTDecoderLayer"] + return model, config, layer_wise_ops + + +def get_ops(ops, config): + if config is None: + return ops * 16 * 16 + return ops * config["weight_width"] * config["data_in_width"] + + +def cost_builder_opt(model, ops, quan_config): + total_ops = 0 + for n, m in model.named_modules(): + if isinstance(m, (OPTDecoderLayer, OPTQuantizedDecoderLayer)): + layer_id = int(n.split(".")[-1]) + layer_n = f"model_layer_{layer_id}" + + layer_quan = quan_config[layer_n] + layer_ops = ops[n] + for n, nm in layer_quan["self_attn"].items(): + total_ops += get_ops(layer_ops["self_attn"][n], nm) + total_ops += get_ops(layer_ops["fc1"], layer_quan["fc1"]) + total_ops += get_ops(layer_ops["fc2"], layer_quan["fc2"]) + return total_ops + + +def quantize_opt(model, config, ids, seq_len): + granularity = config["setup"]["granularity"] + if granularity in ["transformer_layer"]: + return quantize_decoder_layer_opt(model, config, ids, seq_len) + elif granularity in ["matmult"]: + return quantize_gemms_opt(model, config, ids, seq_len) + else: + raise ValueError(f"Unkown setup granularity {granularity}") diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/models/gemma2.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/models/gemma2.py new file mode 100644 index 0000000000000000000000000000000000000000..cddbc1db7be45314a33bcdda71e06887e75c31a6 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/models/gemma2.py @@ -0,0 +1,472 @@ +import logging +from typing import Optional, Tuple +import math +from copy import deepcopy + +import torch +from torch import nn +from transformers.models.gemma2.modeling_gemma2 import ( + Gemma2RMSNorm, + Gemma2RotaryEmbedding, + apply_rotary_pos_emb, + ACT2FN, + Gemma2Config, + Cache, + repeat_kv, + Gemma2ForCausalLM, + Gemma2DecoderLayer, +) + +from ..quantize import qnn +from ..quantize import qF +from .utils import find_matched_pattern, get_layer_name + +logger = logging.getLogger(__name__) +logger.propagate = False + + +class Gemma2QuantizedMLP(nn.Module): + def __init__(self, config, q_config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = qnn.QLinear(self.hidden_size, self.intermediate_size, bias=False, q_config=q_config["gate_proj"]) + self.up_proj = qnn.QLinear(self.hidden_size, self.intermediate_size, bias=False, q_config=q_config["up_proj"]) + self.down_proj = qnn.QLinear(self.intermediate_size, self.hidden_size, bias=False, q_config=q_config["down_proj"]) + self.act_fn = ACT2FN[config.hidden_activation] + + def forward(self, x): + return 
self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +class Gemma2QuantizedAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None, q_config: Optional[dict] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = config.head_dim + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.scaling = config.query_pre_attn_scalar**-0.5 + self.q_config = q_config + + if self.hidden_size % self.num_heads != 0: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + self.q_proj = qnn.QLinear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias, q_config=config.q_config["q_proj"]) + self.k_proj = qnn.QLinear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias, q_config=config.q_config["k_proj"]) + self.v_proj = qnn.QLinear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias, q_config=config.q_config["v_proj"]) + self.o_proj = qnn.QLinear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias, q_config=config.q_config["o_proj"]) + self.rotary_emb = Gemma2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = { + "sin": sin, + "cos": cos, + "sliding_window": self.sliding_window, + "cache_position": cache_position, + } + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) 
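+ # Grouped-query attention: replicate the KV heads to match the number of
+ # query heads before the two (quantized) attention matmuls below.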
+ + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + # attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling + attn_weights = qF.q_matmul(query_states, key_states.transpose(2, 3),q_config=self.q_config["matmult_0"]) * self.scaling + + + if self.config.attn_logit_softcapping is not None: + attn_weights = attn_weights / self.config.attn_logit_softcapping + attn_weights = torch.tanh(attn_weights) + attn_weights = attn_weights * self.config.attn_logit_softcapping + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + # attn_output = torch.matmul(attn_weights, value_states) + attn_output = qF.q_matmul(attn_weights, value_states, q_config = self.q_config["matmult_1"]) + + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + if attn_output.size() == (self.num_heads, q_len, self.head_dim): + attn_output = attn_output.unsqueeze(0) # add batch dimension for bsz=1 + else: + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.view(bsz, q_len, -1) + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +GEMMA2_ATTENTION_CLASSES = { + "eager": Gemma2QuantizedAttention, +} + +class Gemma2QuantizedDecoderLayer(nn.Module): + def __init__(self, config: Gemma2Config, layer_idx: int, q_config: Optional[dict] = None): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + + self.self_attn = GEMMA2_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx, q_config=q_config["self_attn"]) + + self.mlp = Gemma2QuantizedMLP(config, q_config=q_config["mlp"]) + self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.is_sliding = not bool(layer_idx % 2) + self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.sliding_window = config.sliding_window + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if self.is_sliding and attention_mask is not None: # efficient SDPA and no padding + # Flash-attn is a 2D tensor + if self.config._attn_implementation == "flash_attention_2": + if past_key_value is not None: # when decoding + attention_mask = attention_mask[:, -self.sliding_window :] + else: + min_dtype = torch.finfo(hidden_states.dtype).min + sliding_window_mask = torch.tril( + 
torch.ones_like(attention_mask, dtype=torch.bool), diagonal=-self.sliding_window + ) + attention_mask = torch.where(sliding_window_mask, min_dtype, attention_mask) + if attention_mask.shape[-1] <= 1: # when decoding + attention_mask = attention_mask[:, :, :, -self.sliding_window :] + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.pre_feedforward_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_feedforward_layernorm(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + + +def build_layer_q_config_gemma2(q_config: dict, num_layers: int, layer_ids: list[int]): + full_q_config = {} + + if "linear" in q_config and "matmul" in q_config: + linear_q_config = q_config["linear"] + gemm_q_config = q_config["matmul"] + elif "x" in q_config and "w" in q_config: + linear_q_config = q_config + gemm_q_config = q_config + else: + linear_q_config = dict(x=q_config, w=q_config) + gemm_q_config = dict(x=q_config, w=q_config) + + for layer_id in range(num_layers): + # quantize selected layer + layer_name = f"model_layer_{layer_id}" + # quantize selected layer + + if layer_id in layer_ids: + full_q_config[layer_name] = { + "self_attn": { + "q_proj": linear_q_config, + "k_proj": linear_q_config, + "v_proj": linear_q_config, + "o_proj": linear_q_config, + "matmul_0": gemm_q_config, + "matmul_1": gemm_q_config, + }, + "mlp": { + "gate_proj": linear_q_config, + "down_proj": linear_q_config, + "up_proj": linear_q_config, + }, + } + else: + full_q_config[layer_name] = { + "self_attn": { + "q_proj": None, + "k_proj": None, + "v_proj": None, + "o_proj": None, + "matmul_0": None, + "matmul_1": None, + }, + "mlp": { + "gate_proj": None, + "down_proj": None, + "up_proj": None, + }, + } + + return full_q_config + + +def quantize_decoder_layers_gemma2(model: Gemma2ForCausalLM, q_config: dict, layer_ids): + q_config = build_layer_q_config_gemma2(q_config, model.config.num_hidden_layers, layer_ids) + + for layer_id, ori_decoder_layer in enumerate(model.model.layers): + layer_entry = f"model_layer_{layer_id}" + layer_q_config = q_config[layer_entry] + + if layer_id in layer_ids: + new_decoder_layer = Gemma2QuantizedDecoderLayer(model.config, layer_id, layer_q_config) + ori_rope = ori_decoder_layer.self_attn.rotary_emb + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).dtype) + new_decoder_layer.self_attn.rotary_emb = ori_rope + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).device) + new_decoder_layer.load_state_dict(ori_decoder_layer.state_dict(), strict=False) + model.model.layers[layer_id] = new_decoder_layer + + else: + continue + + model._no_split_modules = ["Gemma2QuantizedDecoderLayer", "Gemma2DecoderLayer"] + return model, q_config + + +def build_gemm_q_config_gemma2(q_config: dict, num_layers: int, matmul_ids: list[int]): + full_q_config = {} + + if "linear" in q_config and 
"matmul" in q_config: + linear_q_config = q_config["linear"] + gemm_q_config = q_config["matmul"] + elif "x" in q_config and "w" in q_config: + linear_q_config = q_config + gemm_q_config = q_config + else: + linear_q_config = dict(x=q_config, w=q_config) + gemm_q_config = dict(x=q_config, w=q_config) + + gemm_counter = 0 + + for layer_id in range(num_layers): + layer_entry = f"model_layer_{layer_id}" + full_q_config[layer_entry] = {"self_attn": {}, "mlp": {}} + + for op in ["q_proj", "k_proj", "v_proj", "o_proj", "matmul_0", "matmul_1"]: + if gemm_counter in matmul_ids: + full_q_config[layer_entry]["self_attn"][op] = linear_q_config if "matmul" in op else gemm_q_config + else: + full_q_config[layer_entry]["self_attn"][op] = None + + gemm_counter += 1 + + for op in ["gate_proj", "down_proj", "up_proj"]: + if gemm_counter in matmul_ids: + full_q_config[layer_entry]["mlp"][op] = linear_q_config + else: + full_q_config[layer_entry]["mlp"][op] = None + + gemm_counter += 1 + + return full_q_config + + +def quantize_gemms_gemma2(model: Gemma2ForCausalLM, q_config: dict, matmul_ids: list[int]): + q_config = build_gemm_q_config_gemma2(q_config, model.config.num_hidden_layers, matmul_ids) + + for layer_id, ori_decoder_layer in enumerate(model.model.layers): + layer_entry = f"model_layer_{layer_id}" + layer_q_config = q_config[layer_entry] + + if layer_id in matmul_ids: + new_decoder_layer = Gemma2QuantizedDecoderLayer(model.config, layer_id, layer_q_config) + ori_rope = ori_decoder_layer.self_attn.rotary_emb + new_decoder_layer.load_state_dict(ori_decoder_layer.state_dict(), strict=False) + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).dtype) + new_decoder_layer.self_attn.rotary_emb = ori_rope + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).device) + model.model.layers[layer_id] = new_decoder_layer + + else: + continue + + model._no_split_modules = ["LlamaQuantizedDecoderLayer", "LlamaDecoderLayer"] + return model, q_config + + +def quantize_gemma2(model: Gemma2ForCausalLM, q_config: dict, op_ids: list[int], granularity: str): + """ + op_ids: if granularity is "layer", op_ids is the list of layer ids to quantize; if granularity is "gemm", op_ids is the list of gemm ids to quantize + """ + if granularity == "decoder_layer": + return quantize_decoder_layers_gemma2(model, q_config, op_ids) + elif granularity == "gemm": + return quantize_gemms_gemma2(model, q_config, op_ids) + else: + raise ValueError(f"Unknown granularity {granularity}") + + +def estimate_ops_bits_bits_gemma2(model, q_config, seq_len): + ops = {} + x_bits = {} + w_bits = {} + is_quantized = {} + + def is_bypass(q_config, layer_name, parent_op, op): + if q_config is None: + return True + if q_config[layer_name][parent_op][op] is None: + return True + if ( + q_config[layer_name][parent_op][op]["x"]["name"] == "bypass" + and q_config[layer_name][parent_op][op]["w"]["name"] == "bypass" + ): + return True + return False + + for i in range(model.config.num_hidden_layers): + layer_name = f"model_layer_{i}" + layer = model.model.layers[i] + for op in ["q_proj", "k_proj", "v_proj", "o_proj", "matmul_0", "matmul_1"]: + if op in ["q_proj", "k_proj", "v_proj", "o_proj"]: + problem_shape = ( + seq_len, + getattr(layer.self_attn, op).in_features, + getattr(layer.self_attn, op).out_features, + ) + else: + problem_shape = ( + seq_len, + model.config.hidden_size // model.config.num_attention_heads, + seq_len, + ) + ops[f"{layer_name}.self_attn.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + + if 
not is_bypass(q_config, layer_name, "self_attn", op): + is_quantized[f"{layer_name}.self_attn.{op}"] = True + ops[f"{layer_name}.self_attn.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + if q_config[layer_name]["self_attn"][op]["x"]["name"] == "bypass": + x_bits[f"{layer_name}.self_attn.{op}"] = 16 + else: + x_bits[f"{layer_name}.self_attn.{op}"] = q_config[layer_name]["self_attn"][op]["x"]["width"] + + if q_config[layer_name]["self_attn"][op]["w"]["name"] == "bypass": + w_bits[f"{layer_name}.self_attn.{op}"] = 16 + else: + w_bits[f"{layer_name}.self_attn.{op}"] = q_config[layer_name]["self_attn"][op]["w"]["width"] + else: + is_quantized[f"{layer_name}.{op}"] = False + x_bits[f"{layer_name}.self_attn.{op}"] = 16 + w_bits[f"{layer_name}.self_attn.{op}"] = 16 + + for op in ["gate_proj", "down_proj", "up_proj"]: + problem_shape = (seq_len, model.config.hidden_size, model.config.intermediate_size) + + ops[f"{layer_name}.mlp.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + + # if q_config is not None and q_config[layer_name]["mlp"][op] is not None: + if not is_bypass(q_config, layer_name, "mlp", op): + is_quantized[f"{layer_name}.mlp.{op}"] = True + ops[f"{layer_name}.mlp.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + if q_config[layer_name]["mlp"][op]["x"]["name"] == "bypass": + x_bits[f"{layer_name}.mlp.{op}"] = 16 + else: + x_bits[f"{layer_name}.mlp.{op}"] = q_config[layer_name]["mlp"][op]["x"]["width"] + + if q_config[layer_name]["mlp"][op]["w"]["name"] == "bypass": + w_bits[f"{layer_name}.mlp.{op}"] = 16 + else: + w_bits[f"{layer_name}.mlp.{op}"] = q_config[layer_name]["mlp"][op]["w"]["width"] + else: + is_quantized[f"{layer_name}.mlp.{op}"] = False + x_bits[f"{layer_name}.mlp.{op}"] = 16 + w_bits[f"{layer_name}.mlp.{op}"] = 16 + + opts_bits_bits = 0 + for op in ops: + opts_bits_bits += ops[op] * x_bits[op] * w_bits[op] + + raw = dict(ops=ops, bits=x_bits, is_quantized=is_quantized) + + return opts_bits_bits, raw diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/models/llama.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/models/llama.py new file mode 100644 index 0000000000000000000000000000000000000000..cc912437f24bae56bcedeaf0ad69d347603b9d46 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/models/llama.py @@ -0,0 +1,502 @@ +import logging +from typing import Optional, Tuple +import math +from copy import deepcopy + +import torch +from torch import nn +from transformers.models.llama.modeling_llama import ( + LlamaRMSNorm, + LlamaRotaryEmbedding, + LlamaLinearScalingRotaryEmbedding, + LlamaDynamicNTKScalingRotaryEmbedding, + apply_rotary_pos_emb, + ACT2FN, + LlamaConfig, + Cache, + repeat_kv, + LlamaForCausalLM, + LlamaDecoderLayer, +) + +from ..quantize import qnn +from ..quantize import qF +from .utils import find_matched_pattern, get_layer_name + +logger = logging.getLogger(__name__) +logger.propagate = False + + +class LlamaQuantizedMLP(nn.Module): + def __init__(self, config: LlamaConfig, q_config: dict): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + # self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + # self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + # fmt: off + self.gate_proj = qnn.QLinear(self.hidden_size, self.intermediate_size, bias=False, 
q_config=q_config["gate_proj"]) + self.up_proj = qnn.QLinear(self.hidden_size, self.intermediate_size, bias=False, q_config=q_config["up_proj"]) + self.down_proj = qnn.QLinear(self.intermediate_size, self.hidden_size, bias=False, q_config=q_config["down_proj"]) + # fmt: on + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + if self.config.pretraining_tp > 1: + raise NotImplementedError("Pretraining with TP > 1 is not supported yet.") + else: + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + return down_proj + + +class LlamaQuantizedAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: LlamaConfig, layer_idx: Optional[int], q_config: dict): + super().__init__() + self.config = config + self.layer_idx = layer_idx + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + # self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) + # self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + # self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + # self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias) + # fmt: off + self.q_proj = qnn.QLinear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias, q_config=q_config["q_proj"]) + self.k_proj = qnn.QLinear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias, q_config=q_config["k_proj"]) + self.v_proj = qnn.QLinear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias, q_config=q_config["v_proj"]) + self.o_proj = qnn.QLinear(self.hidden_size, self.hidden_size, bias=config.attention_bias, q_config=q_config["o_proj"]) + self.q_config = q_config + # fmt: on + + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = LlamaRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = LlamaLinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "dynamic": + self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + 
past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + if self.config.pretraining_tp > 1: + raise NotImplementedError("Pretraining with TP > 1 is not supported yet.") + + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + # *: matmul + # attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + # fmt: off + attn_weights = qF.q_matmul(query_states, key_states.transpose(2, 3), q_config=self.q_config["matmul_0"]) / math.sqrt(self.head_dim) + # fmt: on + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + # *: matmul + # attn_output = torch.matmul(attn_weights, value_states) + attn_output = qF.q_matmul(attn_weights, value_states, q_config=self.q_config["matmul_1"]) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + if self.config.pretraining_tp > 1: + raise NotImplementedError("Pretraining with TP > 1 is not supported yet.") + else: + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +LLAMA_ATTENTION_CLASSES = { + "eager": LlamaQuantizedAttention, +} + + +class LlamaQuantizedDecoderLayer(nn.Module): + def __init__(self, config: LlamaConfig, layer_idx: int, q_config: dict): + super().__init__() + self.hidden_size = config.hidden_size + + assert config._attn_implementation == "eager", "Only eager attention is supported." 
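+        # swap in quantized attention and MLP; each sub-module reads its own per-op settings from q_config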
+ self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation]( + config=config, layer_idx=layer_idx, q_config=q_config["self_attn"] + ) + + self.mlp = LlamaQuantizedMLP(config, q_config=q_config["mlp"]) + self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +def build_layer_q_config_llama(q_config: dict, num_layers: int, layer_ids: list[int]): + full_q_config = {} + + if "linear" in q_config and "matmul" in q_config: + linear_q_config = q_config["linear"] + gemm_q_config = q_config["matmul"] + elif "x" in q_config and "w" in q_config: + linear_q_config = q_config + gemm_q_config = q_config + else: + linear_q_config = dict(x=q_config, w=q_config) + gemm_q_config = dict(x=q_config, w=q_config) + + for layer_id in range(num_layers): + # quantize selected layer + layer_name = f"model_layer_{layer_id}" + # quantize selected layer + + if layer_id in layer_ids: + full_q_config[layer_name] = { + "self_attn": { + "q_proj": linear_q_config, + "k_proj": linear_q_config, + "v_proj": linear_q_config, + "o_proj": linear_q_config, + "matmul_0": gemm_q_config, + "matmul_1": gemm_q_config, + }, + "mlp": { + "gate_proj": linear_q_config, + "down_proj": linear_q_config, + "up_proj": linear_q_config, + }, + } + else: + full_q_config[layer_name] = { + "self_attn": { + "q_proj": None, + "k_proj": None, + "v_proj": None, + "o_proj": None, + "matmul_0": None, + "matmul_1": None, + }, + 
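+            # None marks an op that stays unquantized; estimate_ops_bits_bits_llama counts it as 16-bit bypass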
"mlp": { + "gate_proj": None, + "down_proj": None, + "up_proj": None, + }, + } + + return full_q_config + + +def quantize_decoder_layers_llama(model: LlamaForCausalLM, q_config: dict, layer_ids): + q_config = build_layer_q_config_llama(q_config, model.config.num_hidden_layers, layer_ids) + + for layer_id, ori_decoder_layer in enumerate(model.model.layers): + layer_entry = f"model_layer_{layer_id}" + layer_q_config = q_config[layer_entry] + + if layer_id in layer_ids: + new_decoder_layer = LlamaQuantizedDecoderLayer(model.config, layer_id, layer_q_config) + ori_rope = ori_decoder_layer.self_attn.rotary_emb + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).dtype) + new_decoder_layer.self_attn.rotary_emb = ori_rope + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).device) + new_decoder_layer.load_state_dict(ori_decoder_layer.state_dict(), strict=False) + model.model.layers[layer_id] = new_decoder_layer + + else: + continue + + model._no_split_modules = ["LlamaQuantizedDecoderLayer", "LlamaDecoderLayer"] + return model, q_config + + +def build_gemm_q_config_llama(q_config: dict, num_layers: int, matmul_ids: list[int]): + full_q_config = {} + + if "linear" in q_config and "matmul" in q_config: + linear_q_config = q_config["linear"] + gemm_q_config = q_config["matmul"] + elif "x" in q_config and "w" in q_config: + linear_q_config = q_config + gemm_q_config = q_config + else: + linear_q_config = dict(x=q_config, w=q_config) + gemm_q_config = dict(x=q_config, w=q_config) + + gemm_counter = 0 + + for layer_id in range(num_layers): + layer_entry = f"model_layer_{layer_id}" + full_q_config[layer_entry] = {"self_attn": {}, "mlp": {}} + + for op in ["q_proj", "k_proj", "v_proj", "o_proj", "matmul_0", "matmul_1"]: + if gemm_counter in matmul_ids: + full_q_config[layer_entry]["self_attn"][op] = linear_q_config if "matmul" in op else gemm_q_config + else: + full_q_config[layer_entry]["self_attn"][op] = None + + gemm_counter += 1 + + for op in ["gate_proj", "down_proj", "up_proj"]: + if gemm_counter in matmul_ids: + full_q_config[layer_entry]["mlp"][op] = linear_q_config + else: + full_q_config[layer_entry]["mlp"][op] = None + + gemm_counter += 1 + + return full_q_config + + +def quantize_gemms_llama(model: LlamaForCausalLM, q_config: dict, matmul_ids: list[int]): + q_config = build_gemm_q_config_llama(q_config, model.config.num_hidden_layers, matmul_ids) + + for layer_id, ori_decoder_layer in enumerate(model.model.layers): + layer_entry = f"model_layer_{layer_id}" + layer_q_config = q_config[layer_entry] + + if layer_id in matmul_ids: + new_decoder_layer = LlamaQuantizedDecoderLayer(model.config, layer_id, layer_q_config) + ori_rope = ori_decoder_layer.self_attn.rotary_emb + new_decoder_layer.load_state_dict(ori_decoder_layer.state_dict(), strict=False) + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).dtype) + new_decoder_layer.self_attn.rotary_emb = ori_rope + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).device) + model.model.layers[layer_id] = new_decoder_layer + + else: + continue + + model._no_split_modules = ["LlamaQuantizedDecoderLayer", "LlamaDecoderLayer"] + return model, q_config + + +def quantize_llama(model: LlamaForCausalLM, q_config: dict, op_ids: list[int], granularity: str): + """ + op_ids: if granularity is "layer", op_ids is the list of layer ids to quantize; if granularity is "gemm", op_ids is the list of gemm ids to quantize + """ + if granularity == "decoder_layer": + return quantize_decoder_layers_llama(model, 
q_config, op_ids) + elif granularity == "gemm": + return quantize_gemms_llama(model, q_config, op_ids) + else: + raise ValueError(f"Unknown granularity {granularity}") + + +def estimate_ops_bits_bits_llama(model, q_config, seq_len): + ops = {} + x_bits = {} + w_bits = {} + is_quantized = {} + + def is_bypass(q_config, layer_name, parent_op, op): + if q_config is None: + return True + if q_config[layer_name][parent_op][op] is None: + return True + if ( + q_config[layer_name][parent_op][op]["x"]["name"] == "bypass" + and q_config[layer_name][parent_op][op]["w"]["name"] == "bypass" + ): + return True + return False + + for i in range(model.config.num_hidden_layers): + layer_name = f"model_layer_{i}" + layer = model.model.layers[i] + for op in ["q_proj", "k_proj", "v_proj", "o_proj", "matmul_0", "matmul_1"]: + if op in ["q_proj", "k_proj", "v_proj", "o_proj"]: + problem_shape = ( + seq_len, + getattr(layer.self_attn, op).in_features, + getattr(layer.self_attn, op).out_features, + ) + else: + problem_shape = ( + seq_len, + model.config.hidden_size // model.config.num_attention_heads, + seq_len, + ) + ops[f"{layer_name}.self_attn.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + + if not is_bypass(q_config, layer_name, "self_attn", op): + is_quantized[f"{layer_name}.self_attn.{op}"] = True + ops[f"{layer_name}.self_attn.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + if q_config[layer_name]["self_attn"][op]["x"]["name"] == "bypass": + x_bits[f"{layer_name}.self_attn.{op}"] = 16 + else: + x_bits[f"{layer_name}.self_attn.{op}"] = q_config[layer_name]["self_attn"][op]["x"]["width"] + + if q_config[layer_name]["self_attn"][op]["w"]["name"] == "bypass": + w_bits[f"{layer_name}.self_attn.{op}"] = 16 + else: + w_bits[f"{layer_name}.self_attn.{op}"] = q_config[layer_name]["self_attn"][op]["w"]["width"] + else: + is_quantized[f"{layer_name}.{op}"] = False + x_bits[f"{layer_name}.self_attn.{op}"] = 16 + w_bits[f"{layer_name}.self_attn.{op}"] = 16 + + for op in ["gate_proj", "down_proj", "up_proj"]: + problem_shape = (seq_len, model.config.hidden_size, model.config.intermediate_size) + + ops[f"{layer_name}.mlp.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + + # if q_config is not None and q_config[layer_name]["mlp"][op] is not None: + if not is_bypass(q_config, layer_name, "mlp", op): + is_quantized[f"{layer_name}.mlp.{op}"] = True + ops[f"{layer_name}.mlp.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + if q_config[layer_name]["mlp"][op]["x"]["name"] == "bypass": + x_bits[f"{layer_name}.mlp.{op}"] = 16 + else: + x_bits[f"{layer_name}.mlp.{op}"] = q_config[layer_name]["mlp"][op]["x"]["width"] + + if q_config[layer_name]["mlp"][op]["w"]["name"] == "bypass": + w_bits[f"{layer_name}.mlp.{op}"] = 16 + else: + w_bits[f"{layer_name}.mlp.{op}"] = q_config[layer_name]["mlp"][op]["w"]["width"] + else: + is_quantized[f"{layer_name}.mlp.{op}"] = False + x_bits[f"{layer_name}.mlp.{op}"] = 16 + w_bits[f"{layer_name}.mlp.{op}"] = 16 + + opts_bits_bits = 0 + for op in ops: + opts_bits_bits += ops[op] * x_bits[op] * w_bits[op] + + raw = dict(ops=ops, bits=x_bits, is_quantized=is_quantized) + + return opts_bits_bits, raw diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/models/opt.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/models/opt.py new file mode 100644 index 0000000000000000000000000000000000000000..351de7c34eb49cd4362e658454612a758912d7c5 --- /dev/null +++ 
b/llm-q-scaling-law-master/src/llm_q_scaling_law/models/opt.py @@ -0,0 +1,524 @@ +import logging +from typing import Optional, Tuple +import math +from copy import deepcopy + +import torch +from torch import nn + +from transformers.models.opt.modeling_opt import ( + OPTForCausalLM, + OPTConfig, + OPTLearnedPositionalEmbedding, + OPTDecoderLayer, + ACT2FN, +) + +from ..quantize import qnn +from ..quantize import qF +from .utils import find_matched_pattern, get_layer_name + +logger = logging.getLogger(__name__) +logger.propagate = False + + +class OPTQuantizedAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + config: OPTConfig, + is_decoder: bool = False, + q_config: dict = None, + **kwargs, + ): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.dropout = config.attention_dropout + self.enable_bias = config.enable_bias + + self.head_dim = self.embed_dim // self.num_heads + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {self.num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + + # self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias) + # self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias) + # self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias) + # self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias) + + self.k_proj = qnn.QLinear(self.embed_dim, self.embed_dim, bias=self.enable_bias, q_config=q_config["k_proj"]) + self.v_proj = qnn.QLinear(self.embed_dim, self.embed_dim, bias=self.enable_bias, q_config=q_config["v_proj"]) + self.q_proj = qnn.QLinear(self.embed_dim, self.embed_dim, bias=self.enable_bias, q_config=q_config["q_proj"]) + self.out_proj = qnn.QLinear( + self.embed_dim, self.embed_dim, bias=self.enable_bias, q_config=q_config["out_proj"] + ) + self.q_config = q_config + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = 
self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + # attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + # attn_weights = torch.matmul(query_states, key_states.transpose(1, 2)) + attn_weights = qF.q_matmul(query_states, key_states.transpose(1, 2), q_config=self.q_config["matmul_0"]) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = torch.max( + attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device) + ) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + # upcast to fp32 if the weights are in fp16. Please see https://github.com/huggingface/transformers/pull/17437 + if attn_weights.dtype == torch.float16: + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(torch.float16) + else: + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + # attn_output = torch.bmm(attn_probs, value_states) + # attn_output = torch.matmul(attn_probs, value_states) + attn_output = qF.q_matmul(attn_probs, value_states, q_config=self.q_config["matmul_1"]) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +OPT_ATTENTION_CLASSES = { + "eager": OPTQuantizedAttention, +} + + +class OPTQuantizedDecoderLayer(nn.Module): + def __init__(self, config: OPTConfig, q_config: dict): + super().__init__() + self.embed_dim = config.hidden_size + + self.self_attn = OPT_ATTENTION_CLASSES[config._attn_implementation]( + config=config, is_decoder=True, q_config=q_config["self_attn"] + ) + + self.do_layer_norm_before = config.do_layer_norm_before + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + + self.self_attn_layer_norm = nn.LayerNorm( + self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine + ) + # change the fc1 and fc2 to quantized layer as well + self.fc1 = qnn.QLinear(self.embed_dim, config.ffn_dim, bias=config.enable_bias, q_config=q_config["mlp"]["fc1"]) + self.fc2 = qnn.QLinear(config.ffn_dim, self.embed_dim, bias=config.enable_bias, q_config=q_config["mlp"]["fc2"]) + self.final_layer_norm = nn.LayerNorm(self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`, *optional*): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention + if self.do_layer_norm_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # 350m applies layer norm AFTER attention + if not self.do_layer_norm_before: + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Fully Connected + hidden_states_shape = hidden_states.shape + hidden_states = hidden_states.reshape(-1, hidden_states.size(-1)) + residual = hidden_states + + # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention + if self.do_layer_norm_before: + hidden_states = self.final_layer_norm(hidden_states) + + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + hidden_states = (residual + hidden_states).view(hidden_states_shape) + + # 350m applies layer norm AFTER attention + if not self.do_layer_norm_before: + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +def build_layer_q_config_opt(q_config: dict, num_layers: int, layer_ids: list[int]): + full_q_config = {} + + if "linear" in q_config and "matmul" in q_config: + linear_q_config = q_config["linear"] + gemm_q_config = q_config["matmul"] + elif "x" in q_config and "w" in q_config: + linear_q_config = q_config + gemm_q_config = q_config + else: + linear_q_config = dict(x=q_config, w=q_config) + gemm_q_config = dict(x=q_config, w=q_config) + + for layer_id in range(num_layers): + # quantize selected layer + layer_name = f"model_layer_{layer_id}" + # quantize selected layer + + if layer_id in layer_ids: + full_q_config[layer_name] = { + "self_attn": { + "q_proj": linear_q_config, + "k_proj": linear_q_config, + "v_proj": linear_q_config, + "out_proj": linear_q_config, + "matmul_0": gemm_q_config, + "matmul_1": gemm_q_config, + }, + "mlp": { + "fc1": linear_q_config, + "fc2": linear_q_config, + }, + } + else: + full_q_config[layer_name] = { + "self_attn": { + "q_proj": None, + "k_proj": None, + "v_proj": None, + "out_proj": None, + "matmul_0": None, + "matmul_1": None, + }, + "mlp": { + "fc1": None, + "fc2": None, + }, + } + + return full_q_config + + +def quantize_decoder_layers_opt(model: OPTForCausalLM, q_config: dict, layer_ids): + q_config = build_layer_q_config_opt(q_config, model.config.num_hidden_layers, layer_ids) + + for layer_id, ori_decoder_layer in enumerate(model.model.decoder.layers): + layer_entry = f"model_layer_{layer_id}" + layer_q_config = q_config[layer_entry] + + if layer_id in layer_ids: + new_decoder_layer = OPTQuantizedDecoderLayer(model.config, layer_q_config) + new_decoder_layer.load_state_dict(ori_decoder_layer.state_dict(), strict=False) + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).dtype) + 
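+            # match the original layer's device before installing the quantized replacement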
new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).device) + model.model.decoder.layers[layer_id] = new_decoder_layer + + else: + continue + + model._no_split_modules = ["OPTQuantizedDecoderLayer", "OPTDecoderLayer"] + return model, q_config + + +def build_gemm_q_config_opt(q_config: dict, num_layers: int, matmul_ids: list[int]): + full_q_config = {} + + if "linear" in q_config and "matmul" in q_config: + linear_q_config = q_config["linear"] + gemm_q_config = q_config["matmul"] + elif "x" in q_config and "w" in q_config: + linear_q_config = q_config + gemm_q_config = q_config + else: + linear_q_config = dict(x=q_config, w=q_config) + gemm_q_config = dict(x=q_config, w=q_config) + + gemm_counter = 0 + + for layer_id in range(num_layers): + layer_entry = f"model_layer_{layer_id}" + full_q_config[layer_entry] = {"self_attn": {}, "mlp": {}} + + for op in ["q_proj", "k_proj", "v_proj", "out_proj", "matmul_0", "matmul_1"]: + if gemm_counter in matmul_ids: + full_q_config[layer_entry]["self_attn"][op] = linear_q_config if "matmul" in op else gemm_q_config + else: + full_q_config[layer_entry]["self_attn"][op] = None + + gemm_counter += 1 + + for op in ["fc1", "fc2"]: + if gemm_counter in matmul_ids: + full_q_config[layer_entry]["mlp"][op] = linear_q_config + else: + full_q_config[layer_entry]["mlp"][op] = None + + gemm_counter += 1 + + return full_q_config + + +def quantize_gemms_opt(model: OPTForCausalLM, q_config: dict, matmul_ids: list[int]): + q_config = build_gemm_q_config_opt(q_config, model.config.num_hidden_layers, matmul_ids) + + for layer_id, ori_decoder_layer in enumerate(model.model.decoder.layers): + layer_entry = f"model_layer_{layer_id}" + layer_q_config = q_config[layer_entry] + + if layer_id in matmul_ids: + new_decoder_layer = OPTQuantizedDecoderLayer(model.config, layer_q_config) + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).dtype) + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).device) + new_decoder_layer.load_state_dict(ori_decoder_layer.state_dict(), strict=False) + model.model.decoder.layers[layer_id] = new_decoder_layer + + else: + continue + + model._no_split_modules = ["OPTQuantizedDecoderLayer", "OPTDecoderLayer"] + return model, q_config + + +def quantize_opt(model: OPTForCausalLM, q_config: dict, op_ids: list[int], granularity: str): + """ + op_ids: if granularity is "layer", op_ids is the list of layer ids to quantize; if granularity is "gemm", op_ids is the list of gemm ids to quantize + """ + if granularity == "decoder_layer": + return quantize_decoder_layers_opt(model, q_config, op_ids) + elif granularity == "gemm": + return quantize_gemms_opt(model, q_config, op_ids) + else: + raise ValueError(f"Unknown granularity {granularity}") + + +def estimate_ops_bits_bits_opt(model, q_config, seq_len): + ops = {} + x_bits = {} + w_bits = {} + is_quantized = {} + + def is_bypass(q_config, layer_name, parent_op, op): + if q_config is None: + return True + if q_config[layer_name][parent_op][op] is None: + return True + if ( + q_config[layer_name][parent_op][op]["x"]["name"] == "bypass" + and q_config[layer_name][parent_op][op]["w"]["name"] == "bypass" + ): + return True + return False + + for i in range(model.config.num_hidden_layers): + layer_name = f"model_layer_{i}" + layer = model.model.decoder.layers[i] + for op in ["q_proj", "k_proj", "v_proj", "out_proj", "matmul_0", "matmul_1"]: + if op in ["q_proj", "k_proj", "v_proj", "out_proj"]: + problem_shape = ( + seq_len, + getattr(layer.self_attn, op).in_features, + 
getattr(layer.self_attn, op).out_features, + ) + else: + problem_shape = ( + seq_len, + model.config.hidden_size // model.config.num_attention_heads, + seq_len, + ) + ops[f"{layer_name}.self_attn.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + + if not is_bypass(q_config, layer_name, "self_attn", op): + is_quantized[f"{layer_name}.self_attn.{op}"] = True + ops[f"{layer_name}.self_attn.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + if q_config[layer_name]["self_attn"][op]["x"]["name"] == "bypass": + x_bits[f"{layer_name}.self_attn.{op}"] = 16 + else: + x_bits[f"{layer_name}.self_attn.{op}"] = q_config[layer_name]["self_attn"][op]["x"]["width"] + + if q_config[layer_name]["self_attn"][op]["w"]["name"] == "bypass": + w_bits[f"{layer_name}.self_attn.{op}"] = 16 + else: + w_bits[f"{layer_name}.self_attn.{op}"] = q_config[layer_name]["self_attn"][op]["w"]["width"] + else: + is_quantized[f"{layer_name}.{op}"] = False + x_bits[f"{layer_name}.self_attn.{op}"] = 16 + w_bits[f"{layer_name}.self_attn.{op}"] = 16 + + for op in ["fc1", "fc2"]: + problem_shape = (seq_len, model.config.hidden_size, model.config.ffn_dim) + + ops[f"{layer_name}.mlp.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + + # if q_config is not None and q_config[layer_name]["mlp"][op] is not None: + if not is_bypass(q_config, layer_name, "mlp", op): + is_quantized[f"{layer_name}.mlp.{op}"] = True + ops[f"{layer_name}.mlp.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + if q_config[layer_name]["mlp"][op]["x"]["name"] == "bypass": + x_bits[f"{layer_name}.mlp.{op}"] = 16 + else: + x_bits[f"{layer_name}.mlp.{op}"] = q_config[layer_name]["mlp"][op]["x"]["width"] + + if q_config[layer_name]["mlp"][op]["w"]["name"] == "bypass": + w_bits[f"{layer_name}.mlp.{op}"] = 16 + else: + w_bits[f"{layer_name}.mlp.{op}"] = q_config[layer_name]["mlp"][op]["w"]["width"] + else: + is_quantized[f"{layer_name}.mlp.{op}"] = False + x_bits[f"{layer_name}.mlp.{op}"] = 16 + w_bits[f"{layer_name}.mlp.{op}"] = 16 + + opts_bits_bits = 0 + for op in ops: + opts_bits_bits += ops[op] * x_bits[op] * w_bits[op] + + raw = dict(ops=ops, bits=x_bits, is_quantized=is_quantized) + + return opts_bits_bits, raw diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/models/phi.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/models/phi.py new file mode 100644 index 0000000000000000000000000000000000000000..cca3afd826ec917029f9310bbdc11be236a3f082 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/models/phi.py @@ -0,0 +1,519 @@ +import logging +from typing import Optional, Tuple +import math +from copy import deepcopy + +import torch +from torch import nn +from transformers.models.phi.modeling_phi import ( + PhiRotaryEmbedding, + PhiLinearScalingRotaryEmbedding, + PhiDynamicNTKScalingRotaryEmbedding, + apply_rotary_pos_emb, + ACT2FN, + PhiConfig, + Cache, + repeat_kv, + PhiForCausalLM, + PhiDecoderLayer, +) + +from ..quantize import qnn +from ..quantize import qF +from .utils import find_matched_pattern, get_layer_name + +logger = logging.getLogger(__name__) +logger.propagate = False + + +class PhiQuantizedMLP(nn.Module): + def __init__(self, config, q_config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = qnn.QLinear(config.hidden_size, config.intermediate_size, q_config["fc1"]) + self.fc2 = qnn.QLinear(config.intermediate_size, config.hidden_size, q_config["fc2"]) + + def forward(self, hidden_states: 
torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class PhiQuantizedAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: PhiConfig, layer_idx: Optional[int] = None, q_config: Optional[dict] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.rope_theta = config.rope_theta + self.rotary_ndims = int(self.head_dim * config.partial_rotary_factor) + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + self.q_proj = qnn.QLinear( + self.hidden_size, self.num_heads * self.head_dim, bias=True, q_config=q_config["q_proj"] + ) + self.k_proj = qnn.QLinear( + self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True, q_config=q_config["k_proj"] + ) + self.v_proj = qnn.QLinear( + self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True, q_config=q_config["v_proj"] + ) + self.dense = qnn.QLinear( + self.num_heads * self.head_dim, self.hidden_size, bias=True, q_config=q_config["dense"] + ) + self.q_config = q_config + + self.qk_layernorm = config.qk_layernorm + if self.qk_layernorm: + self.q_layernorm = nn.LayerNorm( + config.hidden_size // self.num_heads, eps=config.layer_norm_eps, elementwise_affine=True + ) + self.k_layernorm = nn.LayerNorm( + config.hidden_size // self.num_heads, eps=config.layer_norm_eps, elementwise_affine=True + ) + + self.rotary_emb = PhiRotaryEmbedding(config=self.config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + if self.qk_layernorm: + query_states = self.q_layernorm(query_states) + key_states = self.k_layernorm(key_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + if position_embeddings is None: + logger.warning_once( + "The attention 
layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." + ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings + + # Partial rotary embedding + query_rot, query_pass = ( + query_states[..., : self.rotary_ndims], + query_states[..., self.rotary_ndims :], + ) + key_rot, key_pass = ( + key_states[..., : self.rotary_ndims], + key_states[..., self.rotary_ndims :], + ) + # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor] + query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin) + + # [batch_size, seq_length, num_heads, head_dim] + query_states = torch.cat((query_rot, query_pass), dim=-1) + key_states = torch.cat((key_rot, key_pass), dim=-1) + + if past_key_value is not None: + cache_kwargs = { + "sin": sin, + "cos": cos, + "partial_rotation_size": self.rotary_ndims, + "cache_position": cache_position, + } + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + # Queries and keys upcast to fp32 is required by Phi-2 to avoid overflow + # attn_weights = torch.matmul( + # query_states.to(torch.float32), key_states.to(torch.float32).transpose(2, 3) + # ) / math.sqrt(self.head_dim) + + # TODO: @cheng - please review whether this will lead to model accuracy loss in phi-2 with the above comment stating upcast to fp32 is required + attn_weights = qF.q_matmul( + query_states.to(torch.float32), + key_states.transpose(2, 3).to(torch.float32), + q_config=self.q_config["matmul_0"], + ) / math.sqrt(self.head_dim) + + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights += causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + + # attn_output = torch.matmul(attn_weights, value_states) + attn_output = qF.q_matmul(attn_weights, value_states, q_config=self.q_config["matmul_1"]) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + if attn_output.size() == (self.num_heads, q_len, self.head_dim): + attn_output = attn_output.unsqueeze(0) # add batch dimension for bsz=1 + else: + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.dense(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +PHI_ATTENTION_CLASSES = { + "eager": PhiQuantizedAttention, +} + + +class PhiQuantizedDecoderLayer(nn.Module): + def __init__(self, config: PhiConfig, layer_idx: int, q_config: dict): + super().__init__() + self.self_attn = PHI_ATTENTION_CLASSES[config._attn_implementation]( + config, layer_idx=layer_idx, q_config=q_config["self_attn"] + ) + self.mlp = PhiQuantizedMLP(config, q_config["mlp"]) + 
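+        # Phi uses a single pre-attention LayerNorm; the attention and MLP branches run in parallel on its output (see forward)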
self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): + input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range + `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence + position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. 
+ kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + attn_outputs, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + ) + attn_outputs = self.resid_dropout(attn_outputs) + + feed_forward_hidden_states = self.resid_dropout(self.mlp(hidden_states)) + hidden_states = attn_outputs + feed_forward_hidden_states + residual + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +def build_layer_q_config_phi(q_config: dict, num_layers: int, layer_ids: list[int]): + full_q_config = {} + + if "linear" in q_config and "matmul" in q_config: + linear_q_config = q_config["linear"] + gemm_q_config = q_config["matmul"] + elif "x" in q_config and "w" in q_config: + linear_q_config = q_config + gemm_q_config = q_config + else: + linear_q_config = dict(x=q_config, w=q_config) + gemm_q_config = dict(x=q_config, w=q_config) + + for layer_id in range(num_layers): + # quantize selected layer + layer_name = f"model_layer_{layer_id}" + # quantize selected layer + + if layer_id in layer_ids: + full_q_config[layer_name] = { + "self_attn": { + "q_proj": linear_q_config, + "k_proj": linear_q_config, + "v_proj": linear_q_config, + "dense": linear_q_config, + "matmul_0": gemm_q_config, + "matmul_1": gemm_q_config, + }, + "mlp": { + "fc1": linear_q_config, + "fc2": linear_q_config, + }, + } + else: + full_q_config[layer_name] = { + "self_attn": { + "q_proj": None, + "k_proj": None, + "v_proj": None, + "dense": None, + "matmul_0": None, + "matmul_1": None, + }, + "mlp": { + "fc1": None, + "fc2": None, + }, + } + + return full_q_config + + +def quantize_decoder_layers_phi(model: PhiForCausalLM, q_config: dict, layer_ids): + q_config = build_layer_q_config_phi(q_config, model.config.num_hidden_layers, layer_ids) + + for layer_id, ori_decoder_layer in enumerate(model.model.layers): + layer_entry = f"model_layer_{layer_id}" + layer_q_config = q_config[layer_entry] + + if layer_id in layer_ids: + new_decoder_layer = PhiQuantizedDecoderLayer(model.config, layer_id, layer_q_config) + ori_rope = ori_decoder_layer.self_attn.rotary_emb + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).dtype) + new_decoder_layer.self_attn.rotary_emb = ori_rope + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).device) + new_decoder_layer.load_state_dict(ori_decoder_layer.state_dict(), strict=False) + model.model.layers[layer_id] = new_decoder_layer + + else: + continue + + model._no_split_modules = ["PhiQuantizedDecoderLayer", "PhiDecoderLayer"] + return model, q_config + + +def build_gemm_q_config_phi(q_config: dict, num_layers: int, matmul_ids: list[int]): + full_q_config = {} + + if "linear" in q_config and "matmul" in q_config: + linear_q_config = q_config["linear"] + gemm_q_config = q_config["matmul"] + elif "x" in q_config and "w" in q_config: + linear_q_config = q_config + gemm_q_config = q_config + else: + linear_q_config = dict(x=q_config, w=q_config) + gemm_q_config = dict(x=q_config, w=q_config) + + gemm_counter = 0 + + for layer_id in 
range(num_layers): + layer_entry = f"model_layer_{layer_id}" + full_q_config[layer_entry] = {"self_attn": {}, "mlp": {}} + + for op in ["q_proj", "k_proj", "v_proj", "dense", "matmul_0", "matmul_1"]: + if gemm_counter in matmul_ids: + full_q_config[layer_entry]["self_attn"][op] = linear_q_config if "matmul" in op else gemm_q_config + else: + full_q_config[layer_entry]["self_attn"][op] = None + + gemm_counter += 1 + + for op in ["fc1", "fc2"]: + if gemm_counter in matmul_ids: + full_q_config[layer_entry]["mlp"][op] = linear_q_config + else: + full_q_config[layer_entry]["mlp"][op] = None + + gemm_counter += 1 + + return full_q_config + + +def quantize_gemms_phi(model: PhiForCausalLM, q_config: dict, matmul_ids: list[int]): + q_config = build_gemm_q_config_phi(q_config, model.config.num_hidden_layers, matmul_ids) + + for layer_id, ori_decoder_layer in enumerate(model.model.layers): + layer_entry = f"model_layer_{layer_id}" + layer_q_config = q_config[layer_entry] + + if layer_id in matmul_ids: + new_decoder_layer = PhiQuantizedDecoderLayer(model.config, layer_id, layer_q_config) + ori_rope = ori_decoder_layer.self_attn.rotary_emb + new_decoder_layer.load_state_dict(ori_decoder_layer.state_dict(), strict=False) + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).dtype) + new_decoder_layer.self_attn.rotary_emb = ori_rope + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).device) + model.model.layers[layer_id] = new_decoder_layer + + else: + continue + + model._no_split_modules = ["PhiQuantizedDecoderLayer", "PhiDecoderLayer"] + return model, q_config + + +def quantize_phi(model: PhiForCausalLM, q_config: dict, op_ids: list[int], granularity: str): + """ + op_ids: if granularity is "layer", op_ids is the list of layer ids to quantize; if granularity is "gemm", op_ids is the list of gemm ids to quantize + """ + if granularity == "decoder_layer": + return quantize_decoder_layers_phi(model, q_config, op_ids) + elif granularity == "gemm": + return quantize_gemms_phi(model, q_config, op_ids) + else: + raise ValueError(f"Unknown granularity {granularity}") + + +def estimate_ops_bits_bits_phi(model, q_config, seq_len): + ops = {} + x_bits = {} + w_bits = {} + is_quantized = {} + + def is_bypass(q_config, layer_name, parent_op, op): + if q_config is None: + return True + if q_config[layer_name][parent_op][op] is None: + return True + if ( + q_config[layer_name][parent_op][op]["x"]["name"] == "bypass" + and q_config[layer_name][parent_op][op]["w"]["name"] == "bypass" + ): + return True + return False + + for i in range(model.config.num_hidden_layers): + layer_name = f"model_layer_{i}" + layer = model.model.layers[i] + for op in ["q_proj", "k_proj", "v_proj", "dense", "matmul_0", "matmul_1"]: + if op in ["q_proj", "k_proj", "v_proj", "dense"]: + problem_shape = ( + seq_len, + getattr(layer.self_attn, op).in_features, + getattr(layer.self_attn, op).out_features, + ) + else: + problem_shape = ( + seq_len, + model.config.hidden_size // model.config.num_attention_heads, + seq_len, + ) + ops[f"{layer_name}.self_attn.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + + if not is_bypass(q_config, layer_name, "self_attn", op): + is_quantized[f"{layer_name}.self_attn.{op}"] = True + ops[f"{layer_name}.self_attn.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + if q_config[layer_name]["self_attn"][op]["x"]["name"] == "bypass": + x_bits[f"{layer_name}.self_attn.{op}"] = 16 + else: + x_bits[f"{layer_name}.self_attn.{op}"] = 
q_config[layer_name]["self_attn"][op]["x"]["width"] + + if q_config[layer_name]["self_attn"][op]["w"]["name"] == "bypass": + w_bits[f"{layer_name}.self_attn.{op}"] = 16 + else: + w_bits[f"{layer_name}.self_attn.{op}"] = q_config[layer_name]["self_attn"][op]["w"]["width"] + else: + is_quantized[f"{layer_name}.{op}"] = False + x_bits[f"{layer_name}.self_attn.{op}"] = 16 + w_bits[f"{layer_name}.self_attn.{op}"] = 16 + + for op in ["fc1", "fc2"]: + problem_shape = (seq_len, model.config.hidden_size, model.config.intermediate_size) + + ops[f"{layer_name}.mlp.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + + # if q_config is not None and q_config[layer_name]["mlp"][op] is not None: + if not is_bypass(q_config, layer_name, "mlp", op): + is_quantized[f"{layer_name}.mlp.{op}"] = True + ops[f"{layer_name}.mlp.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + if q_config[layer_name]["mlp"][op]["x"]["name"] == "bypass": + x_bits[f"{layer_name}.mlp.{op}"] = 16 + else: + x_bits[f"{layer_name}.mlp.{op}"] = q_config[layer_name]["mlp"][op]["x"]["width"] + + if q_config[layer_name]["mlp"][op]["w"]["name"] == "bypass": + w_bits[f"{layer_name}.mlp.{op}"] = 16 + else: + w_bits[f"{layer_name}.mlp.{op}"] = q_config[layer_name]["mlp"][op]["w"]["width"] + else: + is_quantized[f"{layer_name}.mlp.{op}"] = False + x_bits[f"{layer_name}.mlp.{op}"] = 16 + w_bits[f"{layer_name}.mlp.{op}"] = 16 + + opts_bits_bits = 0 + for op in ops: + opts_bits_bits += ops[op] * x_bits[op] * w_bits[op] + + raw = dict(ops=ops, bits=x_bits, is_quantized=is_quantized) + + return opts_bits_bits, raw diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/models/phi3.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/models/phi3.py new file mode 100644 index 0000000000000000000000000000000000000000..b4fcf671e2a0225858f84fe150645696dec536eb --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/models/phi3.py @@ -0,0 +1,507 @@ +import logging +from typing import Optional, Tuple +import math +from copy import deepcopy + +import torch +from torch import nn +from transformers.models.phi3.modeling_phi3 import ( + Phi3RMSNorm, + Phi3RotaryEmbedding, + # Phi3LongRoPEScaledRotaryEmbedding, + apply_rotary_pos_emb, + ACT2FN, + Phi3Config, + Cache, + repeat_kv, + Phi3ForCausalLM, + Phi3DecoderLayer, +) + +from ..quantize import qnn +from ..quantize import qF +from .utils import find_matched_pattern, get_layer_name + +logger = logging.getLogger(__name__) +logger.propagate = False + + +class Phi3QuantizedMLP(nn.Module): + def __init__(self, config, q_config): + super().__init__() + + self.config = config + # self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False) + # self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) + self.gate_up_proj = qnn.QLinear( + config.hidden_size, 2 * config.intermediate_size, bias=False, q_config=q_config["gate_up_proj"] + ) + self.down_proj = qnn.QLinear( + config.intermediate_size, config.hidden_size, bias=False, q_config=q_config["down_proj"] + ) + + self.activation_fn = ACT2FN[config.hidden_act] + + def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + up_states = self.gate_up_proj(hidden_states) + + gate, up_states = up_states.chunk(2, dim=-1) + up_states = up_states * self.activation_fn(gate) + + return self.down_proj(up_states) + + +class Phi3QuantizedAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: 
Phi3Config, layer_idx: Optional[int], q_config: dict): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.original_max_position_embeddings = config.original_max_position_embeddings + self.rope_theta = config.rope_theta + self.rope_scaling = config.rope_scaling + self.is_causal = True + self.q_config = q_config + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + op_size = self.num_heads * self.head_dim + 2 * (self.num_key_value_heads * self.head_dim) + # self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + # self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=False) + self.o_proj = qnn.QLinear( + self.num_heads * self.head_dim, self.hidden_size, bias=False, q_config=q_config["o_proj"] + ) + self.qkv_proj = qnn.QLinear(self.hidden_size, op_size, bias=False, q_config=q_config["qkv_proj"]) + self._init_rope() + + def _init_rope(self): + if self.rope_scaling is None: + self.rotary_emb = Phi3RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + if scaling_type == "longrope": + # self.rotary_emb = Phi3LongRoPEScaledRotaryEmbedding(self.head_dim, self.config) + # in 4.42.3 we ignore the long context rope for now, note that this affect the maxiumum context length + raise NotImplementedError("LongRoPE is not supported in this version.") + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + logger.warning_once("You are not running the flash-attention implementation, expect numerical differences.") + + bsz, q_len, _ = hidden_states.size() + + qkv = self.qkv_proj(hidden_states) + query_pos = self.num_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if 
past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + # attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + attn_weights = qF.q_matmul( + query_states, key_states.transpose(2, 3), q_config=self.q_config["matmul_0"] + ) / math.sqrt(self.head_dim) + + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights += causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + + # attn_output = torch.matmul(attn_weights, value_states) + attn_output = qF.q_matmul(attn_weights, value_states, q_config=self.q_config["matmul_1"]) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + if attn_output.size() == (self.num_heads, q_len, self.head_dim): + attn_output = attn_output.unsqueeze(0) # add batch dimension for bsz=1 + else: + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +PHI3_ATTENTION_CLASSES = { + "eager": Phi3QuantizedAttention, +} + + +class Phi3QuantizedDecoderLayer(nn.Module): + def __init__(self, config: Phi3Config, layer_idx: int, q_config: dict): + super().__init__() + + self.config = config + self.self_attn = PHI3_ATTENTION_CLASSES[config._attn_implementation]( + config, layer_idx=layer_idx, q_config=q_config["self_attn"] + ) + + self.mlp = Phi3QuantizedMLP(config, q_config["mlp"]) + self.input_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.resid_attn_dropout = nn.Dropout(config.resid_pdrop) + self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop) + self.post_attention_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, 
torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): + input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range + `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence + kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + attn_outputs, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + hidden_states = residual + self.resid_attn_dropout(attn_outputs) + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + self.resid_mlp_dropout(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +def build_layer_q_config_phi3(q_config: dict, num_layers: int, layer_ids: list[int]): + full_q_config = {} + + if "linear" in q_config and "matmul" in q_config: + linear_q_config = q_config["linear"] + gemm_q_config = q_config["matmul"] + elif "x" in q_config and "w" in q_config: + linear_q_config = q_config + gemm_q_config = q_config + else: + linear_q_config = dict(x=q_config, w=q_config) + gemm_q_config = dict(x=q_config, w=q_config) + + for layer_id in range(num_layers): + # quantize selected layer + layer_name = f"model_layer_{layer_id}" + # quantize selected layer + + if layer_id in layer_ids: + full_q_config[layer_name] = { + "self_attn": { + "qkv_proj": linear_q_config, + "o_proj": linear_q_config, + "matmul_0": gemm_q_config, + "matmul_1": gemm_q_config, + }, + "mlp": { + "gate_up_proj": linear_q_config, + "down_proj": linear_q_config, + }, + } + else: + full_q_config[layer_name] = { + "self_attn": { + "qkv_proj": None, + "o_proj": None, + "matmul_0": None, + "matmul_1": None, + }, + "mlp": { + "gate_up_proj": None, + "down_proj": None, + }, + } + + return full_q_config + + +def quantize_decoder_layers_phi3(model: Phi3ForCausalLM, q_config: dict, layer_ids): + q_config = build_layer_q_config_phi3(q_config, model.config.num_hidden_layers, layer_ids) + + for layer_id, ori_decoder_layer in enumerate(model.model.layers): + layer_entry = f"model_layer_{layer_id}" + layer_q_config = 
q_config[layer_entry] + + if layer_id in layer_ids: + new_decoder_layer = Phi3QuantizedDecoderLayer(model.config, layer_id, layer_q_config) + ori_rope = ori_decoder_layer.self_attn.rotary_emb + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).dtype) + new_decoder_layer.self_attn.rotary_emb = ori_rope + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).device) + new_decoder_layer.load_state_dict(ori_decoder_layer.state_dict(), strict=False) + model.model.layers[layer_id] = new_decoder_layer + + else: + continue + + model._no_split_modules = ["Phi3QuantizedDecoderLayer", "Phi3DecoderLayer"] + return model, q_config + + + def build_gemm_q_config_phi3(q_config: dict, num_layers: int, matmul_ids: list[int]): + full_q_config = {} + + if "linear" in q_config and "matmul" in q_config: + linear_q_config = q_config["linear"] + gemm_q_config = q_config["matmul"] + elif "x" in q_config and "w" in q_config: + linear_q_config = q_config + gemm_q_config = q_config + else: + linear_q_config = dict(x=q_config, w=q_config) + gemm_q_config = dict(x=q_config, w=q_config) + + gemm_counter = 0 + + for layer_id in range(num_layers): + layer_entry = f"model_layer_{layer_id}" + full_q_config[layer_entry] = {"self_attn": {}, "mlp": {}} + + for op in ["qkv_proj", "o_proj", "matmul_0", "matmul_1"]: + if gemm_counter in matmul_ids: + # matmul ops take the matmul (gemm) config, linear projections take the linear config + full_q_config[layer_entry]["self_attn"][op] = gemm_q_config if "matmul" in op else linear_q_config + else: + full_q_config[layer_entry]["self_attn"][op] = None + + gemm_counter += 1 + + for op in ["gate_up_proj", "down_proj"]: + if gemm_counter in matmul_ids: + full_q_config[layer_entry]["mlp"][op] = linear_q_config + else: + full_q_config[layer_entry]["mlp"][op] = None + + gemm_counter += 1 + + return full_q_config + + + def quantize_gemms_phi3(model: Phi3ForCausalLM, q_config: dict, matmul_ids: list[int]): + q_config = build_gemm_q_config_phi3(q_config, model.config.num_hidden_layers, matmul_ids) + + for layer_id, ori_decoder_layer in enumerate(model.model.layers): + layer_entry = f"model_layer_{layer_id}" + layer_q_config = q_config[layer_entry] + + if layer_id in matmul_ids: + new_decoder_layer = Phi3QuantizedDecoderLayer(model.config, layer_id, layer_q_config) + ori_rope = ori_decoder_layer.self_attn.rotary_emb + new_decoder_layer.load_state_dict(ori_decoder_layer.state_dict(), strict=False) + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).dtype) + new_decoder_layer.self_attn.rotary_emb = ori_rope + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).device) + model.model.layers[layer_id] = new_decoder_layer + + else: + continue + + model._no_split_modules = ["Phi3QuantizedDecoderLayer", "Phi3DecoderLayer"] + return model, q_config + + + def quantize_phi3(model: Phi3ForCausalLM, q_config: dict, op_ids: list[int], granularity: str): + """ + op_ids: if granularity is "decoder_layer", op_ids is the list of decoder layer ids to quantize; if granularity is "gemm", op_ids is the list of gemm ids to quantize + """ + if granularity == "decoder_layer": + return quantize_decoder_layers_phi3(model, q_config, op_ids) + elif granularity == "gemm": + return quantize_gemms_phi3(model, q_config, op_ids) + else: + raise ValueError(f"Unknown granularity {granularity}") + + + def estimate_ops_bits_bits_phi3(model, q_config, seq_len): + ops = {} + x_bits = {} + w_bits = {} + is_quantized = {} + + def is_bypass(q_config, layer_name, parent_op, op): + if q_config is None: + return True + if q_config[layer_name][parent_op][op] is None: + return True + if ( +
q_config[layer_name][parent_op][op]["x"]["name"] == "bypass" + and q_config[layer_name][parent_op][op]["w"]["name"] == "bypass" + ): + return True + return False + + for i in range(model.config.num_hidden_layers): + layer_name = f"model_layer_{i}" + layer = model.model.layers[i] + for op in ["qkv_proj", "o_proj", "matmul_0", "matmul_1"]: + if op in ["qkv_proj", "o_proj"]: + problem_shape = ( + seq_len, + getattr(layer.self_attn, op).in_features, + getattr(layer.self_attn, op).out_features, + ) + else: + problem_shape = ( + seq_len, + model.config.hidden_size // model.config.num_attention_heads, + seq_len, + ) + ops[f"{layer_name}.self_attn.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + + if not is_bypass(q_config, layer_name, "self_attn", op): + is_quantized[f"{layer_name}.self_attn.{op}"] = True + ops[f"{layer_name}.self_attn.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + if q_config[layer_name]["self_attn"][op]["x"]["name"] == "bypass": + x_bits[f"{layer_name}.self_attn.{op}"] = 16 + else: + x_bits[f"{layer_name}.self_attn.{op}"] = q_config[layer_name]["self_attn"][op]["x"]["width"] + + if q_config[layer_name]["self_attn"][op]["w"]["name"] == "bypass": + w_bits[f"{layer_name}.self_attn.{op}"] = 16 + else: + w_bits[f"{layer_name}.self_attn.{op}"] = q_config[layer_name]["self_attn"][op]["w"]["width"] + else: + is_quantized[f"{layer_name}.{op}"] = False + x_bits[f"{layer_name}.self_attn.{op}"] = 16 + w_bits[f"{layer_name}.self_attn.{op}"] = 16 + + for op in ["gate_up_proj", "down_proj"]: + problem_shape = (seq_len, model.config.hidden_size, model.config.intermediate_size) + + ops[f"{layer_name}.mlp.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + + # if q_config is not None and q_config[layer_name]["mlp"][op] is not None: + if not is_bypass(q_config, layer_name, "mlp", op): + is_quantized[f"{layer_name}.mlp.{op}"] = True + ops[f"{layer_name}.mlp.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + if q_config[layer_name]["mlp"][op]["x"]["name"] == "bypass": + x_bits[f"{layer_name}.mlp.{op}"] = 16 + else: + x_bits[f"{layer_name}.mlp.{op}"] = q_config[layer_name]["mlp"][op]["x"]["width"] + + if q_config[layer_name]["mlp"][op]["w"]["name"] == "bypass": + w_bits[f"{layer_name}.mlp.{op}"] = 16 + else: + w_bits[f"{layer_name}.mlp.{op}"] = q_config[layer_name]["mlp"][op]["w"]["width"] + else: + is_quantized[f"{layer_name}.mlp.{op}"] = False + x_bits[f"{layer_name}.mlp.{op}"] = 16 + w_bits[f"{layer_name}.mlp.{op}"] = 16 + + opts_bits_bits = 0 + for op in ops: + opts_bits_bits += ops[op] * x_bits[op] * w_bits[op] + + raw = dict(ops=ops, bits=x_bits, is_quantized=is_quantized) + + return opts_bits_bits, raw diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/models/qwen2.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/models/qwen2.py new file mode 100644 index 0000000000000000000000000000000000000000..3374681a2ed01855247a93c74c77b864ebe13744 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/models/qwen2.py @@ -0,0 +1,480 @@ +import logging +from typing import Optional, Tuple +import math +from copy import deepcopy + +import torch +from torch import nn +from transformers.models.qwen2.modeling_qwen2 import ( + Qwen2RMSNorm, + Qwen2RotaryEmbedding, + apply_rotary_pos_emb, + ACT2FN, + Qwen2Config, + Cache, + repeat_kv, + Qwen2ForCausalLM, + Qwen2DecoderLayer, + + +) + +from ..quantize import qnn +from ..quantize import qF +from .utils import find_matched_pattern, get_layer_name + +logger = 
logging.getLogger(__name__) +logger.propagate = False + + +class Qwen2QuantizedMLP(nn.Module): + def __init__(self, config: Qwen2Config, q_config: dict): + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + + self.gate_proj = qnn.QLinear(self.hidden_size, self.intermediate_size, bias=False,q_config=q_config["gate_proj"]) + self.up_proj = qnn.QLinear(self.hidden_size, self.intermediate_size, bias=False, q_config=q_config["up_proj"]) + self.down_proj = qnn.QLinear(self.intermediate_size, self.hidden_size, bias=False, q_config=q_config["down_proj"]) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, hidden_state): + return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state)) + +class Qwen2QuantizedAttention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: Qwen2Config, layer_idx: Optional[int], q_config: dict): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " + "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + self.q_proj = qnn.QLinear(self.hidden_size, self.num_heads * self.head_dim, bias=True, q_config=q_config["q_proj"]) + self.k_proj = qnn.QLinear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True, q_config=q_config["k_proj"]) + self.v_proj = qnn.QLinear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True, q_config=q_config["v_proj"]) + self.o_proj = qnn.QLinear(self.num_heads * self.head_dim, self.hidden_size, bias=False, q_config=q_config["o_proj"]) + self.q_config = q_config + + self.rotary_emb = Qwen2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + # attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + attn_weights = qF.q_matmul(query_states, key_states.transpose(2, 3), q_config=self.q_config["matmul_0"]) / math.sqrt(self.head_dim) + + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + # attn_output = torch.matmul(attn_weights, value_states) + attn_output = qF.q_matmul(attn_weights, value_states, q_config=self.q_config["matmul_1"]) + + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + # in the case of bsz=1, reshape attn_output from (64,1024,128) to (1,64,1024,128) + if attn_output.size() == (self.num_heads, q_len, self.head_dim): + attn_output = attn_output.unsqueeze(0) + else: + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + +QWEN2_ATTENTION_CLASSES = { + "eager": Qwen2QuantizedAttention, + # "sdpa" : Qwen2QuantizedAttention, #set both attn implementation to the same class for now +} + +class Qwen2QuantizedDecoderLayer(nn.Module): + def __init__(self, config: Qwen2Config, layer_idx: int, q_config: dict): + super().__init__() + self.hidden_size = config.hidden_size + + assert config._attn_implementation == "eager", "Only eager attention is supported." 
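For orientation, the quantized attention and MLP modules above are driven entirely by a nested `q_config` dictionary keyed by operator name. The sketch below shows the expected shape of that dictionary for one decoder layer; it is illustrative only, with field names borrowed from the `mxint`-style entries defined later in `DEFAULT_Q_CONFIGS` (the 4-bit width and block settings are assumptions, not repo defaults).

```python
# Illustrative per-layer q_config for Qwen2QuantizedDecoderLayer (values are assumptions).
# Each operator maps to an {"x": ..., "w": ...} pair; None leaves that operator un-quantized.
mxint4 = {"name": "mxint", "width": 4, "block_size": 16, "block_axis": -1}

layer_q_config = {
    "self_attn": {
        "q_proj": {"x": mxint4, "w": mxint4},
        "k_proj": {"x": mxint4, "w": mxint4},
        "v_proj": {"x": mxint4, "w": mxint4},
        "o_proj": {"x": mxint4, "w": mxint4},
        "matmul_0": {"x": mxint4, "w": mxint4},  # Q @ K^T
        "matmul_1": {"x": mxint4, "w": mxint4},  # softmax(QK^T) @ V
    },
    "mlp": {
        "gate_proj": {"x": mxint4, "w": mxint4},
        "up_proj": {"x": mxint4, "w": mxint4},
        "down_proj": None,  # an operator kept in full precision
    },
}
```

This is the structure produced per layer by `build_layer_q_config_qwen2` further down, where layers that are not selected receive `None` for every operator.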
+ self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx, q_config=q_config["self_attn"]) + + self.mlp = Qwen2QuantizedMLP(config, q_config=q_config["mlp"]) + self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. + kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + +def build_layer_q_config_qwen2(q_config: dict, num_layers: int, layer_ids: list[int]): + full_q_config = {} + + if "linear" in q_config and "matmul" in q_config: + linear_q_config = q_config["linear"] + gemm_q_config = q_config["matmul"] + elif "x" in q_config and "w" in q_config: + linear_q_config = q_config + gemm_q_config = q_config + else: + linear_q_config = dict(x=q_config, w=q_config) + gemm_q_config = dict(x=q_config, w=q_config) + + for layer_id in range(num_layers): + # quantize selected layer + layer_name = f"model_layer_{layer_id}" + # quantize selected layer + + if layer_id in layer_ids: + full_q_config[layer_name] = { + "self_attn": { + "q_proj": linear_q_config, + "k_proj": linear_q_config, + "v_proj": linear_q_config, + "o_proj": linear_q_config, + "matmul_0": gemm_q_config, + "matmul_1": gemm_q_config, + }, + "mlp": { + "gate_proj": linear_q_config, + "down_proj": linear_q_config, + "up_proj": 
linear_q_config, + }, + } + else: + full_q_config[layer_name] = { + "self_attn": { + "q_proj": None, + "k_proj": None, + "v_proj": None, + "o_proj": None, + "matmul_0": None, + "matmul_1": None, + }, + "mlp": { + "gate_proj": None, + "down_proj": None, + "up_proj": None, + }, + } + + return full_q_config + + + def quantize_decoder_layers_qwen2(model: Qwen2ForCausalLM, q_config: dict, layer_ids): + q_config = build_layer_q_config_qwen2(q_config, model.config.num_hidden_layers, layer_ids) + + for layer_id, ori_decoder_layer in enumerate(model.model.layers): + layer_entry = f"model_layer_{layer_id}" + layer_q_config = q_config[layer_entry] + + if layer_id in layer_ids: + new_decoder_layer = Qwen2QuantizedDecoderLayer(model.config, layer_id, layer_q_config) + ori_rope = ori_decoder_layer.self_attn.rotary_emb + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).dtype) + new_decoder_layer.self_attn.rotary_emb = ori_rope + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).device) + new_decoder_layer.load_state_dict(ori_decoder_layer.state_dict(), strict=False) + model.model.layers[layer_id] = new_decoder_layer + + else: + continue + + model._no_split_modules = ["Qwen2QuantizedDecoderLayer", "Qwen2DecoderLayer"] + return model, q_config + + + def build_gemm_q_config_qwen2(q_config: dict, num_layers: int, matmul_ids: list[int]): + full_q_config = {} + + if "linear" in q_config and "matmul" in q_config: + linear_q_config = q_config["linear"] + gemm_q_config = q_config["matmul"] + elif "x" in q_config and "w" in q_config: + linear_q_config = q_config + gemm_q_config = q_config + else: + linear_q_config = dict(x=q_config, w=q_config) + gemm_q_config = dict(x=q_config, w=q_config) + + gemm_counter = 0 + + for layer_id in range(num_layers): + layer_entry = f"model_layer_{layer_id}" + full_q_config[layer_entry] = {"self_attn": {}, "mlp": {}} + + for op in ["q_proj", "k_proj", "v_proj", "o_proj", "matmul_0", "matmul_1"]: + if gemm_counter in matmul_ids: + # matmul ops take the matmul (gemm) config, linear projections take the linear config + full_q_config[layer_entry]["self_attn"][op] = gemm_q_config if "matmul" in op else linear_q_config + else: + full_q_config[layer_entry]["self_attn"][op] = None + + gemm_counter += 1 + + for op in ["gate_proj", "down_proj", "up_proj"]: + if gemm_counter in matmul_ids: + full_q_config[layer_entry]["mlp"][op] = linear_q_config + else: + full_q_config[layer_entry]["mlp"][op] = None + + gemm_counter += 1 + + return full_q_config + + + def quantize_gemms_qwen2(model: Qwen2ForCausalLM, q_config: dict, matmul_ids: list[int]): + q_config = build_gemm_q_config_qwen2(q_config, model.config.num_hidden_layers, matmul_ids) + + for layer_id, ori_decoder_layer in enumerate(model.model.layers): + layer_entry = f"model_layer_{layer_id}" + layer_q_config = q_config[layer_entry] + + if layer_id in matmul_ids: + new_decoder_layer = Qwen2QuantizedDecoderLayer(model.config, layer_id, layer_q_config) + ori_rope = ori_decoder_layer.self_attn.rotary_emb + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).dtype) + new_decoder_layer.self_attn.rotary_emb = ori_rope + new_decoder_layer.to(next(iter(ori_decoder_layer.parameters())).device) + new_decoder_layer.load_state_dict(ori_decoder_layer.state_dict(), strict=False) + model.model.layers[layer_id] = new_decoder_layer + + else: + continue + + model._no_split_modules = ["Qwen2QuantizedDecoderLayer", "Qwen2DecoderLayer"] + return model, q_config + + + def quantize_qwen2(model: Qwen2ForCausalLM, q_config: dict, op_ids: list[int], granularity: str): + """ + op_ids: if granularity is "decoder_layer", op_ids
is the list of layer ids to quantize; if granularity is "gemm", op_ids is the list of gemm ids to quantize + """ + if granularity == "decoder_layer": + return quantize_decoder_layers_qwen2(model, q_config, op_ids) + elif granularity == "gemm": + return quantize_gemms_qwen2(model, q_config, op_ids) + else: + raise ValueError(f"Unknown granularity {granularity}") + + +def estimate_ops_bits_bits_qwen2(model, q_config, seq_len): + ops = {} + x_bits = {} + w_bits = {} + is_quantized = {} + + def is_bypass(q_config, layer_name, parent_op, op): + if q_config is None: + return True + if q_config[layer_name][parent_op][op] is None: + return True + if ( + q_config[layer_name][parent_op][op]["x"]["name"] == "bypass" + and q_config[layer_name][parent_op][op]["w"]["name"] == "bypass" + ): + return True + return False + + for i in range(model.config.num_hidden_layers): + layer_name = f"model_layer_{i}" + layer = model.model.layers[i] + for op in ["q_proj", "k_proj", "v_proj", "o_proj", "matmul_0", "matmul_1"]: + if op in ["q_proj", "k_proj", "v_proj", "o_proj"]: + problem_shape = ( + seq_len, + getattr(layer.self_attn, op).in_features, + getattr(layer.self_attn, op).out_features, + ) + else: + problem_shape = ( + seq_len, + model.config.hidden_size // model.config.num_attention_heads, + seq_len, + ) + ops[f"{layer_name}.self_attn.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + + if not is_bypass(q_config, layer_name, "self_attn", op): + is_quantized[f"{layer_name}.self_attn.{op}"] = True + ops[f"{layer_name}.self_attn.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + if q_config[layer_name]["self_attn"][op]["x"]["name"] == "bypass": + x_bits[f"{layer_name}.self_attn.{op}"] = 16 + else: + x_bits[f"{layer_name}.self_attn.{op}"] = q_config[layer_name]["self_attn"][op]["x"]["width"] + + if q_config[layer_name]["self_attn"][op]["w"]["name"] == "bypass": + w_bits[f"{layer_name}.self_attn.{op}"] = 16 + else: + w_bits[f"{layer_name}.self_attn.{op}"] = q_config[layer_name]["self_attn"][op]["w"]["width"] + else: + is_quantized[f"{layer_name}.{op}"] = False + x_bits[f"{layer_name}.self_attn.{op}"] = 16 + w_bits[f"{layer_name}.self_attn.{op}"] = 16 + + for op in ["gate_proj", "down_proj", "up_proj"]: + problem_shape = (seq_len, model.config.hidden_size, model.config.intermediate_size) + + ops[f"{layer_name}.mlp.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + + # if q_config is not None and q_config[layer_name]["mlp"][op] is not None: + if not is_bypass(q_config, layer_name, "mlp", op): + is_quantized[f"{layer_name}.mlp.{op}"] = True + ops[f"{layer_name}.mlp.{op}"] = problem_shape[0] * problem_shape[1] * problem_shape[2] * 2 + if q_config[layer_name]["mlp"][op]["x"]["name"] == "bypass": + x_bits[f"{layer_name}.mlp.{op}"] = 16 + else: + x_bits[f"{layer_name}.mlp.{op}"] = q_config[layer_name]["mlp"][op]["x"]["width"] + + if q_config[layer_name]["mlp"][op]["w"]["name"] == "bypass": + w_bits[f"{layer_name}.mlp.{op}"] = 16 + else: + w_bits[f"{layer_name}.mlp.{op}"] = q_config[layer_name]["mlp"][op]["w"]["width"] + else: + is_quantized[f"{layer_name}.mlp.{op}"] = False + x_bits[f"{layer_name}.mlp.{op}"] = 16 + w_bits[f"{layer_name}.mlp.{op}"] = 16 + + opts_bits_bits = 0 + for op in ops: + opts_bits_bits += ops[op] * x_bits[op] * w_bits[op] + + raw = dict(ops=ops, bits=x_bits, is_quantized=is_quantized) + + return opts_bits_bits, raw diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/models/utils.py 
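The `estimate_ops_bits_bits_*` helpers above score a configuration by summing `ops * x_bits * w_bits` over every linear projection and attention matmul, counting bypassed or un-quantized operators at 16 bits for both operands. A small worked example of that accounting for a single projection (the sizes are illustrative, not taken from any particular model):

```python
# ops * x_bits * w_bits accounting for one linear projection, as in the estimators above.
seq_len, in_features, out_features = 1024, 4096, 4096

ops = seq_len * in_features * out_features * 2   # multiply and add counted separately

fp16_cost = ops * 16 * 16    # un-quantized / bypass operator
mxint4_cost = ops * 4 * 4    # 4-bit activations x 4-bit weights

print(fp16_cost / mxint4_cost)   # 16.0, i.e. a 16x reduction for this operator
```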
b/llm-q-scaling-law-master/src/llm_q_scaling_law/models/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e63b500a5731f78da45503fe7b616fec86e4e34 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/models/utils.py @@ -0,0 +1,149 @@ +import os +import sys +import re +from itertools import chain +import torch +from accelerate import infer_auto_device_map +from nvitop import CudaDevice, parse_cuda_visible_devices + + +def get_all_device_mem_info() -> dict[int, dict[str, int]]: + visible_devices = parse_cuda_visible_devices( + os.getenv("CUDA_VISIBLE_DEVICES", None) + ) + visible_devices = [CudaDevice(i) for i in visible_devices] + memory_info = {} + for device in visible_devices: + mem_info_i = device.memory_info() + memory_info[device.index] = { + "total (GB)": round(mem_info_i.total / 1024**3, 2), + "used (GB)": round(mem_info_i.used / 1024**3, 2), + "free (GB)": round(mem_info_i.free / 1024**3, 2), + } + return memory_info + + +def find_matched_pattern(query: str, patterns: list[str]) -> str | None: + patterns: list[re.Pattern] = [re.compile(pattern) for pattern in patterns] + + matched_patterns = [] + + for pattern in patterns: + if pattern.fullmatch(query): + matched_patterns.append(pattern) + + if len(matched_patterns) > 1: + raise ValueError(f"Multiple patterns matched: {matched_patterns}") + + return matched_patterns[0].pattern if len(matched_patterns) == 1 else None + + +def get_layer_name(module, layer): + # get the name of the op relative to the module + for name, m in module.named_modules(): + if m is layer: + return name + raise ValueError(f"Cannot find op {layer} in module {module}") + + +def get_layer_by_name(module, layer_name): + # get the op by its name relative to the module + for name, m in module.named_modules(): + if name == layer_name: + return m + raise ValueError(f"Cannot find op {layer_name} in module {module}") + + +def set_layer_by_name(module, name, new_layer): + levels = name.split(".") + if len(levels) > 1: + mod_ = module + for l_idx in range(len(levels) - 1): + if levels[l_idx].isdigit(): + mod_ = mod_[int(levels[l_idx])] + else: + mod_ = getattr(mod_, levels[l_idx]) + setattr(mod_, levels[-1], new_layer) + else: + setattr(module, name, new_layer) + + +def create_device_map(model, device_map) -> dict[str, int]: + if device_map == "auto": + device_map = infer_auto_device_map( + model, + no_split_module_classes=model._no_split_modules, + ) + elif device_map == "auto-balanced": + max_memory = { + i: torch.cuda.mem_get_info(i)[0] // 2 + for i in range(torch.cuda.device_count()) + } + device_map = infer_auto_device_map( + model, + no_split_module_classes=model._no_split_modules, + max_memory=max_memory, + ) + n_devices = torch.cuda.device_count() + n_decoder_layers = model.config.num_hidden_layers + n_layers_per_device = n_decoder_layers // n_devices + balanced_device_map = {} + current_device = 0 + current_decoder_idx = 0 + + for layer_name in device_map: + if ".layers." 
in layer_name: + if (current_decoder_idx + 1) % n_layers_per_device == 0: + current_device += 1 + current_decoder_idx += 1 + balanced_device_map[layer_name] = min(current_device, n_devices - 1) + device_map = balanced_device_map + else: + assert isinstance(device_map, dict) + return device_map + + +def enable_exception_hook(debugger="ipdb"): + + if debugger == "pudb": + + def excepthook(etype, evalue, etb): + from IPython.core import ultratb + import pudb + + ultratb.FormattedTB()(etype, evalue, etb) + for exc in [KeyboardInterrupt, FileNotFoundError]: + if issubclass(etype, exc): + sys.exit(-1) + pudb.post_mortem(etb) + + elif debugger == "ipdb": + + def excepthook(etype, evalue, etb): + from IPython.core import ultratb + import ipdb + + ultratb.FormattedTB()(etype, evalue, etb) + for exc in [KeyboardInterrupt, FileNotFoundError]: + if issubclass(etype, exc): + sys.exit(-1) + ipdb.post_mortem(etb) + + else: + raise ValueError(f"Unknown debugger: {debugger}") + + +def get_full_device_map(model: torch.nn.Module): + device_map = {} + for name, module in model.named_modules(): + try: + device_map[name] = next(chain(module.parameters(), module.buffers())).device + except StopIteration: + pass + return device_map + + +def move_module_to_device(module, device_map: dict): + for name, device in device_map.items(): + module_ = get_layer_by_name(module, name) + module_.to(device) diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/__init__.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..918700bbf5e80f1c236165db4462371399909eaf --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/__init__.py @@ -0,0 +1 @@ +from .quantize import quantize_model diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/qF/__init__.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/qF/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..09d30c06bbd7ac951128b7b9fd67d7b0939f2b17 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/qF/__init__.py @@ -0,0 +1 @@ +from .matmul import q_matmul diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/qF/matmul.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/qF/matmul.py new file mode 100644 index 0000000000000000000000000000000000000000..d0dd51505d0b7076cc73e6e85c56848072406e12 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/qF/matmul.py @@ -0,0 +1,37 @@ +from copy import deepcopy +from functools import partial +import torch +from ..quantizers import get_quantizer + + +def q_matmul(input: torch.Tensor, other: torch.Tensor, q_config): + ori_shape_a = input.size() + ori_shape_b = other.size() + if input.ndim > 2 or other.ndim > 2: + error_msg = f"The batch size of input and other must be the same, but got {ori_shape_a} and {ori_shape_b}" + assert ori_shape_a[:-2] == ori_shape_b[:-2], error_msg + + input = input.reshape(-1, ori_shape_a[-2], ori_shape_a[-1]) + other = other.reshape(-1, ori_shape_b[-2], ori_shape_b[-1]) + + if q_config is None: + output = torch.matmul(input, other) + else: + a_q_config = deepcopy(q_config["x"]) + b_q_config = deepcopy(q_config["w"]) + + a_quantizer = partial(get_quantizer(a_q_config.pop("name")), **a_q_config) + b_quantizer = partial(get_quantizer(b_q_config.pop("name")), **b_q_config) + + output = torch.matmul(a_quantizer(input), b_quantizer(other)) + + if len(ori_shape_a) == 2: + output = 
output.reshape(ori_shape_a[0], ori_shape_b[-1]) + elif len(ori_shape_a) == 3: + output = output.reshape(ori_shape_a[0], ori_shape_a[1], ori_shape_b[-1]) + elif len(ori_shape_a) > 3: + output = output.reshape(*ori_shape_a[:-2], ori_shape_a[-2], ori_shape_b[-1]) + else: + raise ValueError("Invalid shape") + + return output diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/qnn/__init__.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/qnn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f02f608b5195c7ee8f0bcb4de92ab3548e8d91cb --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/qnn/__init__.py @@ -0,0 +1,3 @@ +from .conv2d import QConv2d +from .linear import QLinear +from .built_in import wrap_forward_built_in_dtype diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/qnn/built_in.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/qnn/built_in.py new file mode 100644 index 0000000000000000000000000000000000000000..8f7d9d64897fc61ebb81462b9bcaacb9de2914ee --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/qnn/built_in.py @@ -0,0 +1,50 @@ +import types +import torch + + +def wrap_forward_built_in_dtype(ori_layer: torch.nn.Module, q_tag: str): + """ + replaces forward method to cast input and weight to built-in dtype, perform dtype-specific computation, and cast back to original dtype + """ + ori_dtype = ori_layer.weight.dtype + + match q_tag: + case "fp32": + q_dtype = torch.float32 + case "fp16": + q_dtype = torch.float16 + case "bf16": + q_dtype = torch.bfloat16 + case _: + raise RuntimeError(f"Unsupported quantizer tag: {q_tag}") + + def linear_forward(self: torch.nn.Linear, x): + x = x.to(q_dtype) + w = self.weight.to(q_dtype) + if self.bias is not None: + bias = self.bias.to(q_dtype) + else: + bias = None + output = torch.nn.functional.linear(x, w, bias) + output = output.to(ori_dtype) + return output + + def conv2d_forward(self: torch.nn.Conv2d, x): + x = x.to(q_dtype) + w = self.weight.to(q_dtype) + if self.bias is not None: + bias = self.bias.to(q_dtype) + else: + bias = None + output = torch.nn.functional.conv2d(x, w, bias, self.stride, self.padding, self.dilation, self.groups) + output = output.to(ori_dtype) + return output + + if isinstance(ori_layer, torch.nn.Linear): + ori_layer.forward = types.MethodType(linear_forward, ori_layer) + elif isinstance(ori_layer, torch.nn.Conv2d): + ori_layer.forward = types.MethodType(conv2d_forward, ori_layer) + else: + raise RuntimeError(f"Unsupported layer type: {type(ori_layer)}") + + return ori_layer diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/qnn/conv2d.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/qnn/conv2d.py new file mode 100644 index 0000000000000000000000000000000000000000..5579a6cddbd55ae456ce726c27411b29f94f37b4 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/qnn/conv2d.py @@ -0,0 +1,93 @@ +from copy import deepcopy +from functools import partial + +import torch.nn as nn +from torch.nn import functional as F + +from ..quantizers import get_quantizer + + +class QConv2d(nn.Conv2d): + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int | F.Tuple[int], + stride: int | F.Tuple[int] = 1, + padding: str | int | F.Tuple[int] = 0, + dilation: int | F.Tuple[int] = 1, + groups: int = 1, + bias: bool = True, + padding_mode: str = "zeros", + device=None, + dtype=None, + q_config=None, + ) -> None: + super().__init__( + 
in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode, + device, + dtype, + ) + + if q_config is None: + self.bypass = True + else: + self.bypass = q_config.get("bypass", False) + + self.q_config = q_config + + if not self.bypass: + x_q_config = deepcopy(q_config["x"]) + w_q_config = deepcopy(q_config["w"]) + + self.x_quantizer = partial(get_quantizer(x_q_config.pop("name")), **x_q_config) + self.w_quantizer = partial(get_quantizer(w_q_config.pop("name")), **w_q_config) + + def forward(self, x): + if self.bypass: + return F.conv2d( + x, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.groups, + ) + else: + x = self.x_quantizer(x) + try: + w = self.w_quantizer(self.weight) + except Exception as e: + breakpoint() + print(e) + + bias = None + if self.bias is not None: + bias = self.w_quantizer(self.bias) + + return F.conv2d( + x, + w, + bias, + self.stride, + self.padding, + self.dilation, + self.groups, + ) + + def __repr__(self): + if self.bypass: + q_config_str = "bypass=True" + else: + q_config_str = f"x_quantizer={self.q_config['x']['name']}, w_quantizer={self.q_config['w']['name']}" + + return f"QConv2d(in_channels={self.in_channels}, out_channels={self.out_channels}, kernel_size={self.kernel_size}, stride={self.stride}, padding={self.padding}, dilation={self.dilation}, groups={self.groups}, bias={self.bias is not None}, padding_mode={self.padding_mode}, {q_config_str})" diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/qnn/linear.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/qnn/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..d4243a1855e96f911ed774d6ae0ee903eabac879 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/qnn/linear.py @@ -0,0 +1,57 @@ +from copy import deepcopy +from functools import partial + +import torch.nn as nn +from torch.nn import functional as F + +from ..quantizers import get_quantizer + + +class QLinear(nn.Linear): + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + q_config=None, + ) -> None: + super().__init__(in_features, out_features, bias, device, dtype) + if q_config is None: + self.bypass = True + else: + self.bypass = q_config.get("bypass", False) + + self.q_config = q_config + + if not self.bypass: + x_q_config = deepcopy(q_config["x"]) + w_q_config = deepcopy(q_config["w"]) + + self.x_quantizer = partial(get_quantizer(x_q_config.pop("name")), **x_q_config) + self.w_quantizer = partial(get_quantizer(w_q_config.pop("name")), **w_q_config) + + self.w_is_quantized = False + + def forward(self, x): + if self.bypass: + return F.linear(x, self.weight, self.bias) + else: + x = self.x_quantizer(x) + if not self.w_is_quantized: + self.weight.copy_(self.w_quantizer(self.weight)) + self.w_is_quantized = True + # if self.bias is not None: + # self.bias = self.w_quantizer(self.bias) + + w = self.weight + bias = self.bias + return F.linear(x, w, bias) + + def __repr__(self): + if self.bypass: + q_config_str = "bypass=True" + else: + q_config_str = f"x_quantizer={self.q_config['x']['name']}, w_quantizer={self.q_config['w']['name']}" + return f"QLinear(in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}, {q_config_str})" diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantize.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantize.py new file mode 
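As a usage-level sketch of the quantized primitives defined above (`QLinear` and `qF.q_matmul`): activations are fake-quantized on every call and the weight is quantized in place on the first call, here mirroring the `mxint8` entry listed in `DEFAULT_Q_CONFIGS` further down. The import paths follow the package layout in this diff but are otherwise an assumption, as are the tensor sizes.

```python
import torch
# paths assumed from the src/llm_q_scaling_law package layout above
from llm_q_scaling_law.quantize.qnn import QLinear
from llm_q_scaling_law.quantize.qF import q_matmul

mxint8 = {"name": "mxint", "width": 8, "block_size": 32, "block_axis": -1}

layer = QLinear(256, 512, bias=True, q_config={"x": mxint8, "w": mxint8})

x = torch.randn(4, 256)
with torch.no_grad():            # the weight is quantized in place on the first forward
    y = layer(x)                 # shape (4, 512)

a = torch.randn(2, 32, 64)
b = torch.randn(2, 64, 32)
scores = q_matmul(a, b, q_config={"x": mxint8, "w": mxint8})  # shape (2, 32, 32)
```

Passing `q_config=None` to either primitive falls through to the plain `F.linear` / `torch.matmul` path, which is how the bypass configs above keep selected operators in full precision.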
100644 index 0000000000000000000000000000000000000000..36846896af2faa4637fa4a57648cbbd64f3301c2 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantize.py @@ -0,0 +1,199 @@ +import logging +from copy import deepcopy +import torch + +from .qnn import QConv2d, QLinear, wrap_forward_built_in_dtype +from .utils import ( + set_layer_by_name, + find_matched_pattern, +) + +logger = logging.getLogger(__name__) + +DEFAULT_Q_CONFIGS = { + # built-in: + "fp32": {"x": {"name": "fp32"}, "w": {"name": "fp32"}}, + "fp16": {"x": {"name": "fp16"}, "w": {"name": "fp16"}}, + "bf16": {"x": {"name": "bf16"}, "w": {"name": "bf16"}}, + # emulated: + "int8-dynamic": { + "x": { + "name": "integer", + "fp_min": None, + "fp_max": None, + "n_bits": 8, + "is_affine": True, + }, + "w": { + "name": "integer", + "fp_min": None, + "fp_max": None, + "n_bits": 8, + "is_affine": True, + }, + }, + "fp8-e4m3": { + "x": { + "name": "minifloat", + "width": 8, + "exponent_width": 4, + "exponent_bias": None, + }, + "w": { + "name": "minifloat", + "width": 8, + "exponent_width": 4, + "exponent_bias": None, + }, + }, + "fp8-e3m4": { + "x": { + "name": "minifloat", + "width": 8, + "exponent_width": 3, + "exponent_bias": None, + }, + "w": { + "name": "minifloat", + "width": 8, + "exponent_width": 4, + "exponent_bias": None, + }, + }, + "mxint8": { + "x": {"name": "mxint", "width": 8, "block_size": 32, "block_axis": -1}, + "w": {"name": "mxint", "width": 8, "block_size": 32, "block_axis": -1}, + }, + "bm8": { + "x": { + "name": "block_minifloat", + "width": 8, + "exponent_width": 2, + "exponent_bias_width": 8, + "block_size": [16, 16], + "skip_first_dim": True, + }, + "w": { + "name": "block_minifloat", + "width": 8, + "exponent_width": 2, + "exponent_bias_width": 8, + "block_size": [16, 16], + "skip_first_dim": False, + }, + }, + "bl8": { + "x": {"name": "block_logarithm", "width": 8, "block_size": [16], "skip_first_dim": True}, + "w": {"name": "block_logarithm", "width": 8, "block_size": [16], "skip_first_dim": False}, + }, + "log8": { + "x": {"name": "logarithm", "width": 8}, + "w": {"name": "logarithm", "width": 8}, + }, + "bypass": None, +} + + +def build_default_q_config(q_name: str): + assert q_name in DEFAULT_Q_CONFIGS, f"Unknown quantizer name {q_name}" + if q_name == "bypass": + default_q_config = None + else: + default_q_config = { + "conv2d": { + "x": deepcopy(DEFAULT_Q_CONFIGS[q_name]["x"]), + "w": deepcopy(DEFAULT_Q_CONFIGS[q_name]["w"]), + }, + "linear": { + "x": deepcopy(DEFAULT_Q_CONFIGS[q_name]["x"]), + "w": deepcopy(DEFAULT_Q_CONFIGS[q_name]["w"]), + }, + } + + return default_q_config + + +def check_if_built_in_dtype(q_tag: str): + return q_tag in ["fp32", "fp16", "bf16"] + + +def quantize_model( + model: torch.nn.Module, + q_config: dict | str, + layers_to_ignore: list[str] = [], +) -> torch.nn.Module: + """Replace the layers in the model with quantized layers. + + **Note that the activations and weights are quantized at inference runtime**, + so the model state dict after quantization are still values in full precision instead of quantized values. + + :param model: FP32 model to be quantized. + :type model: torch.nn.Module + :param q_config: tag of the quantizer or a dictionary containing the quantizer configuration. + Supported quantizer tags: "fp32", "fp16", "bf16", "int8-dynamic", "fp8-e4m3", "fp8-e3m4", "mxint8", "bm8", "bl8", "log8", "bypass". 
+ :type q_config: dict | str + :param layers_to_ignore: a list of regex expressions, which are layer name patterns to keep in full precision, defaults to [] + :type layers_to_ignore: list[str], optional + :return: Quantized model. + :rtype: torch.nn.Module + + + Example: + >>> import torch + >>> from torchvision.models import get_model + >>> from blackbox_locking.quantize import quantize_model + >>> model = get_model("resnet18", weights="DEFAULT").eval().cuda() + >>> model_q = quantize_model(model, "mxint8") + """ + if isinstance(q_config, str): + q_config = build_default_q_config(q_config) + quantized_layers = [] + for name, ori_layer in model.named_modules(): + matched = find_matched_pattern(name, layers_to_ignore) + if matched is not None: + # supports skipping layers like lm_head and cls_head + continue + if not isinstance(ori_layer, (torch.nn.Conv2d, torch.nn.Linear)): + # for now only conv2d and linear are supported + continue + if isinstance(ori_layer, torch.nn.Conv2d): + if check_if_built_in_dtype(q_config["conv2d"]["x"]["name"]) or check_if_built_in_dtype( + q_config["conv2d"]["w"]["name"] + ): + new_layer = wrap_forward_built_in_dtype(ori_layer, q_config["conv2d"]["x"]["name"]) + else: + new_layer = QConv2d( + ori_layer.in_channels, + ori_layer.out_channels, + ori_layer.kernel_size, + ori_layer.stride, + ori_layer.padding, + ori_layer.dilation, + ori_layer.groups, + ori_layer.bias is not None, + ori_layer.padding_mode, + q_config=q_config["conv2d"] if q_config is not None else None, + ) + elif isinstance(ori_layer, torch.nn.Linear): + if check_if_built_in_dtype(q_config["linear"]["x"]["name"]) or check_if_built_in_dtype( + q_config["linear"]["w"]["name"] + ): + new_layer = wrap_forward_built_in_dtype(ori_layer, q_config["linear"]["x"]["name"]) + else: + new_layer = QLinear( + ori_layer.in_features, + ori_layer.out_features, + ori_layer.bias is not None, + q_config=q_config["linear"] if q_config is not None else None, + ) + else: + raise RuntimeError(f"Unsupported layer type: {type(ori_layer)}") + + new_layer.load_state_dict(ori_layer.state_dict()) + new_layer.to(ori_layer.weight.device) + set_layer_by_name(model, name, new_layer) + + quantized_layers.append(name) + + logger.info(f"Quantized layers ({len(quantized_layers)}): {quantized_layers}") + return model diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/__init__.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b0477d1f879f2b7980d16f32f2e8c9098d31e5c8 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/__init__.py @@ -0,0 +1,34 @@ +from .integer import int_quantizer +from .minifloat import minifloat_ieee_quantizer +from .mxint import mxint_quantizer +from .block_minifloat import block_minifloat_quantizer +from .log import log_quantizer +from .block_log import block_log_quantizer +from .bypass import bypass_quantizer +from .torch_built_in import bf16_quantizer, fp16_quantizer, fp32_quantizer + + +def get_quantizer(name: str): + match name: + case "integer": + return int_quantizer + case "minifloat": + return minifloat_ieee_quantizer + case "block_minifloat": + return block_minifloat_quantizer + case "logarithm": + return log_quantizer + case "block_logarithm": + return block_log_quantizer + case "mxint": + return mxint_quantizer + case "bf16": + return bf16_quantizer + case "fp16": + return fp16_quantizer + case "fp32": + return fp32_quantizer + case "bypass": 
+ return bypass_quantizer + case _: + raise ValueError(f"Unknown quantizer name {name}") diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/block_log.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/block_log.py new file mode 100644 index 0000000000000000000000000000000000000000..782bb7e115cb9fd3e211eb17daacea98b489a7c5 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/block_log.py @@ -0,0 +1,114 @@ +import torch +from numpy import ndarray +from torch import Tensor +from .log import _log_quantize + +from .utils import block, q_clamp, unblock + + +def _block_log_quantize( + x: Tensor | ndarray, + width: int, + exponent_bias_width: int = None, + block_size: list[int] | int = [16], + skip_first_dim: bool = False, +): + """ + Convert IEEE FP32/64 to block base-2 log quantized values. A bias is shared over each block + + --- + - forward: convert IEEE FP32/64 to base-2 log quantized values + - backward: This is not STE but close to STE because the derivate of (2**exponent) depends on the rounded exponent + + --- + - `width`: the number of bits, including 1 sign bit and (bits-1) exponent bits + - `exponent_bias_width`: the number of bits of shared exponent bias + - `block_size`: a list of integers where each integer is the block size along the corresponding dim + + """ + exponent_bits = width - 1 + x_shape_before_blocking = [i for i in x.shape] + blocked_x, per_block_max, padded_x_shape, block_shape = block( + x, block_shape=block_size, skip_first_dim=skip_first_dim + ) + + # fill zeros to avoid log2(0) = -inf + if torch.all(per_block_max == 0): + per_block_max = torch.ones_like(per_block_max) + else: + per_block_max[per_block_max == 0] = per_block_max[per_block_max != 0].min() + + per_block_max_exponent = torch.ceil(torch.log2(per_block_max)) + per_block_bias = q_clamp(2**exponent_bits - 1 - per_block_max_exponent, 0, 2**exponent_bias_width - 1) + + per_block_lq_x = _log_quantize(blocked_x, width=width, exponent_bias=per_block_bias) + lq_x = unblock( + per_block_lq_x, + x_shape_before_blocking=x_shape_before_blocking, + padded_x_shape=padded_x_shape, + block_shape=block_shape, + skipped_first_dim_when_blocking=skip_first_dim, + ) + + return lq_x + + +class BlockLogQuantize(torch.autograd.Function): + @staticmethod + def forward( + ctx, + x, + width: int, + exponent_bias_width: int = None, + block_size: list[int] | int = [16], + skip_first_dim: bool = False, + ): + return _block_log_quantize( + x, + width=width, + exponent_bias_width=exponent_bias_width, + block_size=block_size, + skip_first_dim=skip_first_dim, + ) + + @staticmethod + def backward(ctx, grad_output): + return grad_output, None, None, None, None + + +def block_log_quantizer( + x: Tensor, + width: int, + exponent_bias_width: int = 8, + block_size: list[int] | int = [16], + skip_first_dim: bool = False, +): + """ + Convert IEEE FP32/64 to block base-2 log quantized values. 
A bias is shared over each block + + --- + - forward: convert IEEE FP32/64 to base-2 log quantized values + - backward: This is not STE but close to STE because the derivate of (2**exponent) depends on the rounded exponent + + --- + - `width`: the number of bits, including 1 sign bit and (bits-1) exponent bits + - `exponent_bias_width`: the number of bits of shared exponent bias + - `block_size`: a list of integers where each integer is the block size along the corresponding dim + """ + ori_shape = x.size() + if len(ori_shape) > 2: + # a hack to support 4D/5D tensor + x = x.reshape(-1, *ori_shape[-1:]) + + x_q = BlockLogQuantize.apply( + x, + width, + exponent_bias_width, + block_size, + skip_first_dim, + ) + + if len(ori_shape) > 2: + x_q = x_q.reshape(ori_shape) + + return x_q diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/block_minifloat.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/block_minifloat.py new file mode 100644 index 0000000000000000000000000000000000000000..a18a09bc47273b4192b8834647e17c91ab919b19 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/block_minifloat.py @@ -0,0 +1,135 @@ +import torch +from torch import Tensor + +from .utils import block, q_clamp, unblock +from .minifloat import _minifloat_ieee_quantize + + +def _block_minifloat_quantize( + x: Tensor, + width: int, + exponent_width: int, + exponent_bias_width: int, + block_size: list[int] | int = [16], + skip_first_dim: bool = False, +): + """ + - Convert IEEE FP32/64 to Block Minifloat (BM), where an exponent bias is shared over all elements in a block + - `2**-bias_shared x [(-1)^s1 x 2^exponent1 x mantissa1, (-1)^s2 x 2^exponent2 x mantissa2, ...]` + - See https://openreview.net/forum?id=6zaTwpNSsQ2 + + --- + - forward: convert IEEE FP32/64 to BM + - backward: STE + + --- + - `width`: the number of bits (1 sign bit + exponent_bits + mantissa_bits) + - `exponent_width`: the number of exponent_bits + - `exponent_bias_width`: the number of bits of the shared exponent bias + - `block_size`: a list of integers where each integer is the block size on that dimension. See function `block`. 
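Both block quantizers above share the same first step: split the tensor into blocks, take each block's absolute maximum, and derive one shared exponent (or exponent bias) per block from its log2, against which every element in the block is then encoded. A toy illustration of that step, independent of the repo's `block`/`unblock` helpers (values are made up):

```python
import torch

x = torch.tensor([[0.02, -0.75, 0.31, 6.50],
                  [0.10,  0.01, -2.20, 0.40]])

# one block per row in this toy case
per_block_max = x.abs().amax(dim=-1, keepdim=True)         # [[6.5], [2.2]]
shared_exponent = torch.floor(torch.log2(per_block_max))    # [[2.], [1.]]

# all elements of a row are scaled by the same power of two before the
# per-element minifloat / log encoding is applied
print(x / 2**shared_exponent)
```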
+ + """ + x_shape_before_blocking = [i for i in x.shape] + blocked_x, per_block_max, padded_x_shape, block_shape = block( + x, block_shape=block_size, skip_first_dim=skip_first_dim + ) + + # fill zeros to avoid log2(0) = -inf + if torch.all(per_block_max == 0): + per_block_max = torch.ones_like(per_block_max) + else: + per_block_max[per_block_max == 0] = per_block_max[per_block_max != 0].min() + + per_block_exponent_bias = q_clamp(torch.floor(torch.log2(per_block_max)), 0, 2**exponent_bias_width - 1) + per_block_bm_x = _minifloat_ieee_quantize( + blocked_x, + width=width, + exponent_width=exponent_width, + exponent_bias=per_block_exponent_bias, + ) + + bm_x = unblock( + per_block_bm_x, + x_shape_before_blocking=x_shape_before_blocking, + padded_x_shape=padded_x_shape, + block_shape=block_shape, + skipped_first_dim_when_blocking=skip_first_dim, + ) + return bm_x + + +class BlockMinifloatQuantize(torch.autograd.Function): + @staticmethod + def forward( + ctx, + x: Tensor, + width: int, + exponent_width: int, + exponent_bias_width: int, + block_size: list[int] | int = [16], + skip_first_dim: bool = False, + ): + return _block_minifloat_quantize( + x, + width=width, + exponent_width=exponent_width, + exponent_bias_width=exponent_bias_width, + block_size=block_size, + skip_first_dim=skip_first_dim, + ) + + @staticmethod + def backward( + ctx, + grad_output: Tensor, + width: int, + exponent_width: int, + exponent_bias_width: int, + block_size: list[int] | int = [16], + skip_first_dim: bool = False, + ): + return grad_output, None, None, None, None, None + + +def block_minifloat_quantizer( + x: Tensor, + width: int, + exponent_width: int, + exponent_bias_width: int, + block_size: list[int] | int = [16], + skip_first_dim: bool = False, +): + """ + - Convert IEEE FP32/64 to Block Minifloat (BM), where an exponent bias is shared over all elements in a block + - `2**-bias_shared x [(-1)^s1 x 2^exponent1 x mantissa1, (-1)^s2 x 2^exponent2 x mantissa2, ...]` + - See https://openreview.net/forum?id=6zaTwpNSsQ2 + + --- + - forward: convert IEEE FP32/64 to BM + - backward: STE + + --- + - `width`: the number of bits (1 sign bit + exponent_bits + mantissa_bits) + - `exponent_width`: the number of exponent_bits + - `exponent_bias_width`: the number of bits of the shared exponent bias + - `block_size`: a list of integers where each integer is the block size on that dimension. See function `block`. 
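+ - `skip_first_dim`: if True, the first (batch) dimension is never blocked, i.e. activation-style blocking; see the `block` helper in `utils.py`.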
+ + """ + ori_shape = x.size() + if len(ori_shape) > 2: + # a hack to support 4D/5D tensor + x = x.reshape(-1, *ori_shape[-1:]) + + x_q = BlockMinifloatQuantize.apply( + x, + width, + exponent_width, + exponent_bias_width, + block_size, + skip_first_dim, + ) + + if len(ori_shape) > 2: + x_q = x_q.reshape(*ori_shape) + + return x_q diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/bypass.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/bypass.py new file mode 100644 index 0000000000000000000000000000000000000000..94935f7efef18ea28a7b6dbcfde57551b3fd6c9e --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/bypass.py @@ -0,0 +1,2 @@ +def bypass_quantizer(x, *args, **kwargs): + return x diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/integer.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/integer.py new file mode 100644 index 0000000000000000000000000000000000000000..000226dd06ef72f7d352f648a2149fe2e45dfe20 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/integer.py @@ -0,0 +1,144 @@ +import logging +import torch + +from .utils import q_round, q_clamp + + +logger = logging.getLogger(__name__) + + +def int_find_scale(fp_min: float, fp_max: float, n_bits: int, is_affine): + if is_affine: + int_min = 0 + int_max = 2**n_bits - 1 + else: + int_min = -(2 ** (n_bits - 1)) + int_max = 2 ** (n_bits - 1) - 1 + fp_max = max(abs(fp_min), abs(fp_max)) + fp_min = -fp_max + + alpha = fp_min + beta = fp_max + alpha_q = int_min + beta_q = int_max + + scale = (beta - alpha) / (beta_q - alpha_q) + + zero_point = q_round( + (beta * alpha_q - alpha * beta_q) / (beta - alpha), mode="nearest" + ) + + return ( + scale, + zero_point, + int_min, + int_max, + ) + + +def int_find_fp_min_max(x: torch.Tensor, quantile: float): + x = x.flatten() + + n = x.numel() + k = min(int(n * quantile), n - 1) + + x_sorted, _ = x.sort() + + return x_sorted[k].item(), x_sorted[-k].item() + + +def int_quantize( + input: torch.Tensor, + scale: float, + zero_point: int, + int_min: int, + int_max: int, +): + input_q = q_clamp( + q_round(input / scale + zero_point, mode="nearest"), + min_val=int_min, + max_val=int_max, + ) + + return input_q + + +def int_dequantize( + input_q: torch.Tensor, + scale: float, + zero_point: int, +): + input_deq = scale * (input_q - zero_point) + return input_deq + + +def _int_quantizer( + input: torch.Tensor, + fp_min: float, + fp_max: float, + n_bits: int, + is_affine: bool, +): + scale, zero_point, int_min, int_max = int_find_scale( + fp_min, fp_max, n_bits, is_affine + ) + + input_q = int_quantize( + input, + scale, + zero_point, + int_min, + int_max, + ) + + input_deq = int_dequantize( + input_q, + scale, + zero_point, + ) + + return input_deq + + +class IntQuantize(torch.autograd.Function): + @staticmethod + def forward( + ctx, + x: torch.Tensor, + fp_min: float, + fp_max: float, + n_bits: int, + is_affine: bool, + ): + return _int_quantizer( + x, + fp_min, + fp_max, + n_bits, + is_affine, + ) + + @staticmethod + def backward(ctx, grad_output): + grad_input = grad_output.clone() + return grad_input, None, None, None, None + + +def int_quantizer( + x: torch.Tensor, + fp_min: float, + fp_max: float, + n_bits: int, + is_affine: bool, +): + if fp_min is None or fp_max is None: + # quantize and dequantize at runtime + fp_min = x.min().item() + fp_max = x.max().item() + return IntQuantize.apply( + x, + fp_min, + fp_max, + n_bits, + 
is_affine, + ) diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/log.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/log.py new file mode 100644 index 0000000000000000000000000000000000000000..e9492201c216cabfb7e6e3735efbcf66d7f534dc --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/log.py @@ -0,0 +1,74 @@ +import torch + +from numpy import ndarray +from torch import Tensor +import torch + +from .utils import q_clamp, q_round + + +def _log_quantize( + x: Tensor | ndarray, + width: int, + exponent_bias: int | Tensor | ndarray | None, +): + """ + - Use non-uniform, base-2 logarithmic representation to encode IEEE FP32/64 + - This quantisation scheme cannot represent 0. + + --- + - forward: convert IEEE FP32/64 to nearest base-2 log values + - backward: This is not STE but close to STE because the derivate of (2**exponent) depends on the rounded exponent + + --- + Currently, base-2 log representation takes the form (-1)**sign_bit * (2**exponent), + where exponent = intE - exponent_bias, and intE is the unsigned int represented by exponent bits + + --- + Refer to https://arxiv.org/pdf/1603.01025.pdf + """ + + exponent_bits = width - 1 + if exponent_bias in (None, "none", "None"): + exponent_bias = 2 ** (exponent_bits - 1) - 1 + + exponent_max = 2**exponent_bits - 1 - exponent_bias + exponent_min = -exponent_bias + min_pos = 2**exponent_min + + sign = torch.sign(x + min_pos * 0.1) + value = torch.abs(x) + min_pos * 0.1 + + exponent = q_clamp(q_round(torch.log2(value)), exponent_min, exponent_max) + + return sign * (2**exponent) + + +class LogQuantize(torch.autograd.Function): + @staticmethod + def forward(ctx, x: Tensor, width: int, exponent_bias: int | Tensor | ndarray | None): + return _log_quantize(x, width=width, exponent_bias=exponent_bias) + + @staticmethod + def backward(ctx, grad_output): + grad_input = grad_output.clone() + return grad_input, None, None + + +def log_quantizer( + x: Tensor | ndarray, + width: int, + exponent_bias: int | Tensor | ndarray | None = None, +): + """ + Convert IEEE FP32/64 to base-2 log quantized values + + --- + - forward: convert IEEE FP32/64 to base-2 log quantized values + - backward: This is not STE but close to STE because the derivate of (2**exponent) depends on the rounded exponent + + --- + - `width`: the number of bits, including 1 sign bit and (bits-1) exponent bits + - `exponent_bias`: the exponent bias + """ + return LogQuantize.apply(x, width, exponent_bias) diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/minifloat.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/minifloat.py new file mode 100644 index 0000000000000000000000000000000000000000..1be83425bac883caa2733b2c434d068d115eae0e --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/minifloat.py @@ -0,0 +1,131 @@ +import torch +from torch import Tensor + +from .utils import q_clamp, q_round + + +def _minifloat_ieee_quantize( + x: Tensor, + width: int, + exponent_width: int, + exponent_bias: int = None, + round_mode: str = "nearest", +): + """ + - Converts IEEE FP32/64 to minifloat with the implicit leading bit in mantissas. + - No representation for +/-inf or NaN. Large IEEE FP32/64 values will saturate. 
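For reference, a usage sketch of the two scalar quantizers defined above in `integer.py` and `log.py` (import paths assumed from the repo's `src/` layout; the bit widths are only examples):

```python
import torch
from llm_q_scaling_law.quantize.quantizers.integer import int_quantizer
from llm_q_scaling_law.quantize.quantizers.log import log_quantizer

x = torch.randn(4, 8)

# 8-bit affine integer fake-quantisation; passing fp_min=fp_max=None lets the
# quantizer take the range from the tensor itself at runtime.
x_int = int_quantizer(x, fp_min=None, fp_max=None, n_bits=8, is_affine=True)

# 4-bit base-2 log quantisation (1 sign bit + 3 exponent bits) with the default
# exponent bias; every output magnitude is a power of two.
x_log = log_quantizer(x, width=4)

print((x - x_int).abs().max(), (x - x_log).abs().max())  # quantisation error of each scheme
```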
+ + --- + - forward: convert IEEE FP32/64 to minifloat (mantissa has an implicit leading bit) + - backward: STE + + --- + width: the bit width of minifloat + exponent_width: the number of exponent bits in the minifloat + exponent_bias: the value of the exponent bias. If None, the default bias will be (2**exponent_bits - 1) >> 1. + + --- + For example: + a minifloat(bits=8, exponent_bits=4, mantissa_bits=3) number, + 1 0111 011, is equal to (-1)**1 * 2**(7-15) * (1+3/8) = -0.00537109375 + + --- + + Tested extreme cases: large values to saturate, small normal values, small subnormal values, normal precision, subnormal precision, and 0 + """ + mantissa_bits = width - exponent_width - 1 + + # set default bias + if exponent_bias in (None, "none", "None"): + exponent_bias = 2 ** (exponent_width - 1) - 1 + # upper and lower bound of shifted exponent + exponent_max = 2**exponent_width - 1 - exponent_bias + exponent_min = -exponent_bias + # upper and lower bound of shifted minifloat mantissa + shift = 2**mantissa_bits + shifted_mantissa_max = 2**mantissa_bits - 1 + shifted_mantissa_min = 0 + + sign = torch.sign(x + 1e-9) + + value = torch.abs(x) + # clip the exponent before calculating mantissa + exponent = torch.floor(torch.log2(value + 1e-9)) + exponent = q_clamp(exponent, exponent_min, exponent_max) + + mantissa = value / 2**exponent + + shift = 2**mantissa_bits + # fmt: off + # if the clipped exponent is zero, the minifloat is in a subnormal form + # this `is_normal` also help the grad keeps 1 if input x is 0, or the zero-initialized value will be trapped in 0 + if isinstance(exponent_bias, (int, float)): + exponent_bias = torch.tensor([exponent_bias], dtype=exponent.dtype, device=exponent.device) + is_normal = (~torch.isclose(exponent, -exponent_bias)) + + shifted_mantissa = is_normal*q_clamp(q_round(mantissa*shift-shift, mode=round_mode), shifted_mantissa_min, shifted_mantissa_max) +\ + (~is_normal)*q_clamp(q_round(mantissa*shift/2, mode=round_mode), shifted_mantissa_min, shifted_mantissa_max) + mantissa = is_normal*(1.0+shifted_mantissa/shift) + (~is_normal)*(shifted_mantissa/shift*2) + # this `is_close_to_0` helps the grad keeps 1 if input x is 0, or the zero-initialized value will be trapped in 0 + is_close_to_0 = torch.isclose(value, torch.tensor([0.0], dtype=value.dtype, device=value.device)) + minifloat_ieee_x = (~is_close_to_0)*(sign * (2**exponent) * mantissa) + is_close_to_0*x + # fmt: on + return minifloat_ieee_x + + +class MinifloatIEEEQuantize(torch.autograd.Function): + @staticmethod + def forward( + ctx, + x: Tensor, + width: int, + exponent_width: int, + exponent_bias: int = None, + round_mode: str = "nearest", + ): + return _minifloat_ieee_quantize( + x, + width=width, + exponent_width=exponent_width, + exponent_bias=exponent_bias, + round_mode=round_mode, + ) + + @staticmethod + def backward(ctx, grad_output): + grad_input = grad_output.clone() + return grad_input, None, None, None, None + + +def minifloat_ieee_quantizer( + x: Tensor, + width: int, + exponent_width: int, + exponent_bias: int = None, + round_mode: str = "nearest", +): + """ + - Converts IEEE FP32/64 to minifloat with the implicit leading bit in mantissas. + - No representation for +/-inf or NaN. Large IEEE FP32/64 values will saturate. + + --- + - forward: convert IEEE FP32/64 to minifloat (mantissa has an implicit leading bit) + - backward: STE + + --- + width: the bit width of minifloat + exponent_width: the number of exponent bits in the minifloat + exponent_bias: the value of the exponent bias. 
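A usage sketch for this quantizer (import path assumed from the repo's `src/` layout; the FP8-like parameters are only an example):

```python
import torch
from llm_q_scaling_law.quantize.quantizers.minifloat import minifloat_ieee_quantizer

x = torch.randn(16, 16)

# FP8-style E4M3 format: 1 sign bit, 4 exponent bits, 3 mantissa bits;
# exponent_bias=None falls back to the default bias 2**(4-1) - 1 = 7.
x_fp8 = minifloat_ieee_quantizer(x, width=8, exponent_width=4)

# the 4-bit E2M1 format corresponds to width=4, exponent_width=2
x_e2m1 = minifloat_ieee_quantizer(x, width=4, exponent_width=2)

print((x - x_fp8).abs().max(), (x - x_e2m1).abs().max())
```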
If None, the default bias will be (2**exponent_bits - 1) >> 1. + + --- + For example: + a minifloat(bits=8, exponent_bits=4, mantissa_bits=3) number, + 1 0111 011, is equal to (-1)**1 * 2**(7-15) * (1+3/8) = -0.00537109375 + + --- + + Tested extreme cases: large values to saturate, small normal values, small subnormal values, normal precision, subnormal precision, and 0 + """ + return MinifloatIEEEQuantize.apply( + x, width, exponent_width, exponent_bias, round_mode + ) diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/mxint.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/mxint.py new file mode 100644 index 0000000000000000000000000000000000000000..662910b0cd8aebe0848fe94193ef9835efa774fb --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/mxint.py @@ -0,0 +1,216 @@ +import torch + + +def group_tensor( + x: torch.Tensor, block_size: int, block_axis: int +) -> tuple[torch.Tensor, tuple, tuple]: + """Group the elements into blocks along the specified axis. + - Only support 1D, 2D, or 3D tensor. + - When x is 3D tensor, cannot group along batch axis (block_axis=0). + - Use the view and permute to restore grouped x to the original shape. + + :param torch.Tensor x: 1D, 2D, or 3D tensor + :param int block_size: number of elements in each block + :param int block_axis: Group the elements into blocks along the specified axis + :raises ValueError: illegal block_axis + :raises NotImplementedError: illegal tensor dimension and shape + :return tuple[torch.Tensor, tuple, tuple]: grouped tensor, view_args, permute_args + + .. code-block:: python + + >>> x = torch.arange(12).reshape(3, 4) + >>> block_size = 2 + >>> block_axis = -1 + >>> print(x) + tensor([[ 0, 1, 2, 3], + [ 4, 5, 6, 7], + [ 8, 9, 10, 11]]) + >>> x, view_args, permute_args = _group_tensor(x, block_size, block_axis) + >>> print(x) + tensor([[ 0, 1], + [ 2, 3], + [ 4, 5], + [ 6, 7], + [ 8, 9], + [10, 11]]) + >>> print(x.view(view_args).permute(permute_args)) + tensor([[ 0, 1, 2, 3], + [ 4, 5, 6, 7], + [ 8, 9, 10, 11]]) + """ + + if block_axis < 0: + block_axis = x.ndim + block_axis + + ori_shape = x.size() + if x.ndim == 1: + return x.reshape(-1, block_size), ori_shape, (0,) + elif x.ndim == 2: + if block_axis == 0: + permute_args = (1, 0) + x = x.permute(1, 0) + view_args = x.size() + x = x.contiguous() + return x.reshape(-1, block_size), view_args, permute_args + else: + permute_args = (0, 1) + return x.view(-1, block_size), ori_shape, permute_args + elif x.ndim == 3: + if block_axis == 1: + permute_args = (0, 2, 1) + x = x.permute(0, 2, 1) + view_args = x.size() + x = x.contiguous() + return x.reshape(-1, block_size), view_args, permute_args + elif block_axis == 2: + permute_args = (0, 1, 2) + view_args = x.size() + return x.reshape(-1, block_size), view_args, permute_args + else: + raise ValueError("cannot group along batch axis for 3D tensor") + else: + raise NotImplementedError( + "Only support 1D, 2D tensor, and 3D activation tensor" + ) + + +def pad_zeros_if_necessary( + x: torch.Tensor, block_size: int, block_axis: int +) -> torch.Tensor: + """Append zeros to x if the number of elements along block_axis is not a multiple of block_size, else return x. 
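The top-level `mxint_quantizer` defined at the end of this file relies on the padding helper above: inputs whose `block_axis` length is not a multiple of `block_size` are zero-padded before grouping and sliced back to the original shape afterwards. A usage sketch (import path assumed from the repo's `src/` layout, MXINT8-style parameters as an illustration):

```python
import torch
from llm_q_scaling_law.quantize.quantizers.mxint import mxint_quantizer

# MXINT8-style fake quantisation: 8-bit elements, blocks of 16 values sharing
# one power-of-two scale, grouped along the last dimension.
x = torch.randn(8, 40, dtype=torch.bfloat16)  # 40 is deliberately not a multiple of 16
x_q = mxint_quantizer(x, width=8, block_size=16, block_axis=-1)

assert x_q.shape == x.shape and x_q.dtype == torch.bfloat16
print((x.float() - x_q.float()).abs().max())
```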
+ + :param torch.Tensor x: input tensor + :param int block_size: number of elements in each block + :param int block_axis: group the elements into blocks along the specified axis + :return torch.Tensor: padded tensor + """ + + if x.shape[block_axis] % block_size == 0: + return x + + pad_size = block_size - x.shape[block_axis] % block_size + pad_shape = list(x.shape) + pad_shape[block_axis] = pad_size + pad = torch.zeros(pad_shape, dtype=x.dtype, device=x.device) + x = torch.cat([x, pad], dim=block_axis) + return x + + +def _check_shape_mxint(x: torch.Tensor, block_size: int, block_axis: int): + assert x.ndim >= 1, "x must have at least 1 dimension" + # assert ( + # x.shape[block_axis] % block_size == 0 + # ), f"block_size (={block_size}) must divide the number of elements along block_axis (= {x.shape[block_axis]})" + + if x.ndim == 1: + assert block_axis in [0, -1], "block_axis must be 0 or -1 for 1D tensor" + elif x.ndim == 2: + assert block_axis in [ + 0, + 1, + -1, + -2, + ], "block_axis must be 0, 1, -1, or -2 for 2D tensor" + elif x.ndim == 3: + assert block_axis != 0, "cannot group along batch axis for 3D tensor" + assert block_axis in [ + 1, + 2, + -2, + -1, + ], "block_axis must be 1, 2, -2, or -1 for 3D tensor" + else: + raise NotImplementedError( + "Only support 1D, 2D tensor, and 3D activation tensor" + ) + return True + + +def _mxint_quantizer( + x: torch.Tensor, width: int, block_size: int, block_axis: int +) -> torch.Tensor: + ori_type = x.dtype + assert ori_type in [torch.float32, torch.float16, torch.bfloat16, torch.float64] + x = x.to(torch.float32) + assert width <= 8 and width >= 2 + assert _check_shape_mxint(x, block_size, block_axis) + + ori_shape = x.size() + # group the elements into blocks along the specified axis + x = pad_zeros_if_necessary(x, block_size, block_axis) + x, view_args, permute_args = group_tensor(x, block_size, block_axis) + + sign = x < 0 + + # set subnormal numbers to 0 + x = x.abs() + is_normal = x >= torch.finfo(torch.bfloat16).smallest_normal + x = torch.where(is_normal, x, 0.0) + + is_zeros = torch.all(x == 0.0, dim=1, keepdim=True) + # extract exponent + exponent = (x.view(dtype=torch.int32) >> 23) & 0xFF + + # use the max exponent as the shared scale + group_max_exp = exponent.max(dim=1, keepdim=True).values + group_max_exp = torch.where(is_zeros, 1, group_max_exp) + group_max_exp = (group_max_exp << 23).view(torch.float32) + + # elements after the shared scale is extracted + x = x / group_max_exp + + # round the elements to the nearest fixed-point number + # note that the element of the MXINT has 1 sign bit, and (width - 1) bits for the mantissa + # the radix point is after the first bit of the mantissa. + # for example, the mantissa of MXINT8 follows the form: _ . 
_ _ _ _ _ _ , i.e., 1-bit before the radix point, 6-bit after the radix point + x = x * (2 ** (width - 2)) + x = x.round().clamp(0, 2 ** (width - 1) - 1) + + x = x * group_max_exp / (2 ** (width - 2)) + x = torch.where(sign, -x, x) + + # restore x to the original shape + x = x.view(view_args).permute(permute_args) + # if len(ori_shape) == n, then slice x to ori_shape by x[:ori_shape[0], :ori_shape[1], ..., :ori_shape[n-1]] + x = x[tuple(slice(ori_shape[i]) for i in range(len(ori_shape)))] + + x = x.to(ori_type) + return x + + +class MXINTQuantize(torch.autograd.Function): + @staticmethod + def forward( + ctx, + x: torch.Tensor, + width: int, + block_size: int, + block_axis: int, + ): + return _mxint_quantizer(x, width, block_size, block_axis) + + @staticmethod + def backward(ctx, grad_output): + grad_input = grad_output.clone() + return grad_input, None, None, None + + +def mxint_quantizer(x: torch.Tensor, width: int, block_size: int, block_axis: int): + """Emulated quantizer from bfloat16 to mxint8. + + :param torch.Tensor x: torch.bfloat16 tensor + :param int block_size: number of elements in each block + :param int block_axis: group the elements into blocks along the specified axis + :return torch.Tensor: emulated mxint tensor with the same shape as x, dtype=torch.bfloat16 + """ + ori_shape = x.size() + if len(ori_shape) > 3: + # a hack to support 4D/5D tensor + x = x.view(-1, *ori_shape[-2:]) + + x_q = MXINTQuantize.apply(x, width, block_size, block_axis) + + if len(ori_shape) > 3: + x_q = x_q.reshape(*ori_shape) + + return x_q diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/torch_built_in.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/torch_built_in.py new file mode 100644 index 0000000000000000000000000000000000000000..955a17fc11a84e48af38fac21b0576651d67c82d --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/torch_built_in.py @@ -0,0 +1,19 @@ +import logging +import torch + +logger = logging.getLogger(__name__) + +def bf16_quantizer(x: torch.Tensor): + ori_dtype = x.dtype + x = x.to(torch.bfloat16).to(ori_dtype) + return x + +def fp16_quantizer(x: torch.Tensor): + ori_dtype = x.dtype + x = x.to(torch.float16).to(ori_dtype) + return x + +def fp32_quantizer(x: torch.Tensor): + ori_dtype = x.dtype + x = x.to(torch.float32).to(ori_dtype) + return x \ No newline at end of file diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/utils.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8ed9f32dccf3786c6f8ec5c233588cc53f6817b2 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/quantizers/utils.py @@ -0,0 +1,330 @@ +import math +import torch +from torch.autograd.function import InplaceFunction +import torch.nn.functional as F + + +class QClamp(InplaceFunction): + @staticmethod + def forward(ctx, input: torch.Tensor, min_val, max_val): + ctx.min_val = min_val + ctx.max_val = max_val + return input.clamp(min_val, max_val) + + @staticmethod + def backward(ctx, grad_output): + return grad_output, None, None + + +class QRound(InplaceFunction): + @staticmethod + def forward(ctx, input: torch.Tensor, mode="nearest"): + ctx.mode = mode + match mode: + case "nearest": + return torch.round(input) + case "truncate": + return torch.trunc(input) + case "ceil": + return torch.ceil(input) + case "floor": + return torch.floor(input) + case _: + raise 
ValueError(f"Invalid mode: {mode}") + + @staticmethod + def backward(ctx, grad_output): + return grad_output, None + + +def q_clamp(input: torch.Tensor | float, min_val: float, max_val: float): + if isinstance(input, float): + return min(max(input, min_val), max_val) + elif isinstance(input, torch.Tensor): + return QClamp.apply(input, min_val, max_val) + else: + raise ValueError(f"Invalid input type: {type(input)}") + + +def q_round(input: torch.Tensor | float, mode="nearest"): + assert mode in ["nearest", "truncate", "ceil", "floor"] + if isinstance(input, float): + match mode: + case "nearest": + return round(input) + case "truncate": + return math.trunc(input) + case "ceil": + return math.ceil(input) + case "floor": + return math.floor(input) + elif isinstance(input, torch.Tensor): + return QRound.apply(input, mode) + else: + raise ValueError(f"Invalid input type: {type(input)}") + + +# -------------------------------- +# Block and unblock +# -------------------------------- + + +def _infer_block_shape(x_shape: list[int], block_shape: list[int]): + """ + Infer a reasonable block shape. + - right align block_shape with x_shape, + 1. If len(block_shape) > len(x_shape), truncate redundant block_shape dims. + 2. If block_shape.ndim < x_shape.ndim, prepend -1 to block_shape until block_shape.ndim == x_shape.ndim + - if block_shape[i] < x_shape[i], inferred_block_shape[i] = block_shape[i] + - if block_shape[i] >= x_shape[i], inferred_block_shape[i] = x_shape[i] + - if block_shape[i] == -1, inferred_block_shape[i] = x_shape[i] + """ + x_ndim = len(x_shape) + block_ndim = len(block_shape) + + if block_ndim >= x_ndim: + inferred_block_shape = block_shape[-x_ndim:] + else: + inferred_block_shape = [-1] * (x_ndim - block_ndim) + block_shape + for dim_i in range(x_ndim): + if inferred_block_shape[dim_i] == -1 or inferred_block_shape[dim_i] > x_shape[dim_i]: + inferred_block_shape[dim_i] = x_shape[dim_i] + else: + inferred_block_shape[dim_i] = inferred_block_shape[dim_i] + return inferred_block_shape + + +def _infer_padding_shape(x_shape: list[int], block_shape: list[int]): + """ + Calculate paddings to make x_shape[i] divisable by block_shape[i] + """ + pad_diff = [] + for x_shape_dim_i, block_shape_dim_i in zip(x_shape, block_shape): + if block_shape_dim_i == -1 or x_shape_dim_i < block_shape_dim_i: + pad_diff += [0, 0] + else: + num_blocks_dim_i = math.ceil(x_shape_dim_i / block_shape_dim_i) + new_x_dim_i = num_blocks_dim_i * block_shape_dim_i + pad_diff += [new_x_dim_i - x_shape_dim_i, 0] + pad_diff = pad_diff[::-1] + return pad_diff + + +def _block_1d_bias(x: torch.Tensor, block_shape: list[int]): + """ + bias shape: [output_features] -> [num_blocks, block_size] + + The bias of nn.Linear, nn.Conv1d, and nn.Conv2d are all 1D tensors + + --- + x: a bias with bias.ndim == 1 + """ + assert x.ndim == 1 + x_shape = [i for i in x.shape] + block_shape = _infer_block_shape(x_shape, block_shape) + pad_diff = _infer_padding_shape(x_shape, block_shape) + padded_x = F.pad(x, pad_diff) + padded_x_shape = torch.tensor(padded_x.shape, dtype=torch.int) + blocked_x = padded_x.reshape(padded_x_shape[0] // block_shape[0], block_shape[0]) + per_block_max = torch.abs(blocked_x).max(dim=1, keepdim=True)[0] + + return blocked_x, per_block_max, padded_x_shape, block_shape + + +def _unblock_to_1d_bias( + blocked_x: torch.Tensor, + x_shape_before_blocking: list[int], +): + """ + blocked bias shape: [num_blocks, block_size] -> [output_features] + + --- + blocked x: blocked bias with blocked_bias.ndim == 2 + """ + x = 
blocked_x.flatten() + + indexes = [] + for i in range(len(x_shape_before_blocking)): + indexes.append(slice(None, x_shape_before_blocking[i])) + # print(f"indexes: {indexes}") + x = x[indexes] + return x + + +def _block_2d_activation(x: torch.Tensor, block_shape: list[int]): + """ + [batch_size, hidden_size] -> [batch_size, num_blocks, block_size[-1]] + """ + assert x.ndim == 2 + x_shape = [i for i in x.shape] + one_batch_shape = [1, x_shape[1]] + block_shape = _infer_block_shape(one_batch_shape, block_shape=block_shape) + pad_diff = _infer_padding_shape(x_shape, block_shape=block_shape) + padded_x = F.pad(x, pad_diff) + padded_x_shape = torch.tensor(padded_x.shape, dtype=torch.int) + # [batch_size, hidden_size] -> [batch_size, num_blocks, block_size[-1]] + blocked_x = padded_x.reshape(x_shape[0], padded_x_shape[1] // block_shape[-1], block_shape[-1]) + per_block_max = torch.abs(blocked_x).max(dim=2, keepdim=True)[0] + + return blocked_x, per_block_max, padded_x_shape, block_shape + + +def _unblock_to_2d_activation(blocked_x: torch.Tensor, x_shape_before_blocking: list[int]): + """ + [batch_size, num_blocks, block_size] -> [batch_size, hidden_size] + """ + x = blocked_x.flatten(1) + + indexes = [] + for i in range(len(x_shape_before_blocking)): + indexes.append(slice(None, x_shape_before_blocking[i])) + # print(f"indexes: {indexes}") + x = x[indexes] + return x + + +def _block_2d_weight(x: torch.Tensor, block_shape: list[int]): + """ + [in_features, out_features] -> [block_size_0 * block_size_1, num_blocks] + + """ + assert x.ndim == 2 + x_shape = [i for i in x.shape] + block_shape = _infer_block_shape(x_shape, block_shape) + pad_diff = _infer_padding_shape(x_shape, block_shape) + padded_x = F.pad(x, pad_diff) + padded_x_shape = torch.tensor(padded_x.shape, dtype=torch.int) + + padded_x = padded_x.unsqueeze(0).unsqueeze(0) + # [1, 1, in_features, out_features] -> [1, block_size_0 * block_size_1, num_blocks] + blocked_x = F.unfold(padded_x, kernel_size=block_shape, dilation=1, padding=0, stride=block_shape) + + # [1, block_size_0 * block_size_1, num_blocks] -> [block_size_0 * block_size_1, num_blocks] + blocked_x = blocked_x.squeeze(0) + per_block_max = torch.abs(blocked_x).max(dim=0, keepdim=True)[0] + + return blocked_x, per_block_max, padded_x_shape, block_shape + + +def _unblock_to_2d_weight(blocked_x: torch.Tensor, x_shape_before_blocking, padded_x_shape, block_shape): + """ + [block_size_0 * block_size_1, num_blocks] -> [in_features, out_features] + """ + # [block_size_0 * block_size_1, num_blocks] -> [1, padded_x_shape[0], padded_x_shape[1]] + x = F.fold( + blocked_x, + output_size=padded_x_shape, # [padded_in_features, padded_out_features] + kernel_size=block_shape, # [block_shape_0, block_shape_1] + dilation=1, + padding=0, + stride=block_shape, + ) + + x = x.squeeze(0) + indexes = [] + for i in range(len(x_shape_before_blocking)): + indexes.append(slice(None, x_shape_before_blocking[i])) + x = x[indexes] + # print(f"indexes: {indexes}") + return x + + +def _block_3d_activation(x: torch.Tensor, block_shape: list[int]): + """ + [batch_size, hidden_dim_0, hidden_dim_1] -> [batch_size, block_size_0 * block_size_1, num_blocks] + + --- + Return blocked_x, per_block_max, padded_x_shape, block_shape + """ + assert x.ndim == 3 + x_shape = [i for i in x.shape] + one_batch_shape = [1, *x_shape[1:]] + block_shape = _infer_block_shape(one_batch_shape, block_shape) # [1, ...] 
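A self-contained illustration of the `q_round`/`q_clamp` wrappers defined at the top of this file: they behave like `round`/`clamp` in the forward pass but let gradients pass straight through in the backward pass (the values below are arbitrary):

```python
import torch
from llm_q_scaling_law.quantize.quantizers.utils import q_clamp, q_round

x = torch.tensor([0.2, 1.7, -3.4], requires_grad=True)
y = q_clamp(q_round(x, mode="nearest"), -2.0, 2.0)
y.sum().backward()

print(y)       # values [0., 2., -2.]
print(x.grad)  # [1., 1., 1.]: rounding and clamping do not zero the gradient
```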
+ pad_diff = _infer_padding_shape(one_batch_shape, block_shape) + padded_x = F.pad(x, pad_diff) + padded_x_shape = torch.tensor(padded_x.shape, dtype=torch.int) + padded_x = padded_x.unsqueeze(1) + # [batch_size, 1, num_tokens, hidden_size] -> [batch_size, block_size_0 * block_size_1, num_blocks] + blocked_x = F.unfold( + padded_x, + kernel_size=block_shape[1:], + dilation=1, + padding=0, + stride=block_shape[1:], + ) + + per_block_max = torch.abs(blocked_x).max(dim=1, keepdim=True)[0] + + return blocked_x, per_block_max, padded_x_shape, block_shape + + +def _unblock_to_3d_activation(blocked_x: torch.Tensor, x_shape_before_blocking, padded_x_shape, block_shape): + # [batch_size, block_size_0 * block_size_1, num_blocks] -> [batch_size, 1, padded_x_shape_1, padded_x_shape_2] + x = F.fold( + blocked_x, + output_size=padded_x_shape[1:], + kernel_size=block_shape[1:], + dilation=1, + padding=0, + stride=block_shape[1:], + ) + x = x.squeeze(1) + indexes = [] + for i in range(len(x_shape_before_blocking)): + indexes.append(slice(None, x_shape_before_blocking[i])) + x = x[indexes] + # print(f"indexes: {indexes}") + return x + + +def block(x: torch.Tensor, block_shape: list[int], skip_first_dim: bool = False): + """ + - skip_first_dim (bool): If True, block_shape[0] will always take 1. + + --- + Return (blocked_x, per_block_max, padded_x_shape, block_shape) + """ + if x.ndim == 1: + assert skip_first_dim is False, "skip_first_dim must be False for bias to be blocked" + return _block_1d_bias(x, block_shape) + elif x.ndim == 2: + if skip_first_dim: + return _block_2d_activation(x, block_shape) + else: + return _block_2d_weight(x, block_shape) + elif x.ndim == 3: + if skip_first_dim: + return _block_3d_activation(x, block_shape) + else: + raise NotImplementedError("block 3d weight is not supported.") + else: + raise RuntimeError(f"Unsupported x.ndim = {x.ndim}") + + +def unblock( + blocked_x: torch.Tensor, + x_shape_before_blocking: list[int], + padded_x_shape, + block_shape: list[int], + skipped_first_dim_when_blocking: bool = True, +): + if len(x_shape_before_blocking) == 1: + assert skipped_first_dim_when_blocking is False, "first dim of bias can not have been skipped in blocking" + return _unblock_to_1d_bias(blocked_x, x_shape_before_blocking) + elif len(x_shape_before_blocking) == 2: + if skipped_first_dim_when_blocking: + return _unblock_to_2d_activation(blocked_x, x_shape_before_blocking) + else: + return _unblock_to_2d_weight( + blocked_x, + x_shape_before_blocking, + padded_x_shape, + block_shape, + ) + elif len(x_shape_before_blocking) == 3: + if skipped_first_dim_when_blocking: + return _unblock_to_3d_activation(blocked_x, x_shape_before_blocking, padded_x_shape, block_shape) + else: + raise NotImplementedError("unblock to 3d weight is not supported") + else: + raise RuntimeError("Unsupported n.dims ({}) to unblock back".format(len(x_shape_before_blocking))) diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/utils.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5efc409ac4a5ad639ed069c5a87bae2e35263df8 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/quantize/utils.py @@ -0,0 +1,46 @@ +import re + + +def get_layer_name(module, layer): + # get the name of the op relative to the module + for name, m in module.named_modules(): + if m is layer: + return name + raise ValueError(f"Cannot find op {layer} in module {module}") + + +def get_layer_by_name(module, layer_name): + 
# get the op by its name relative to the module + for name, m in module.named_modules(): + if name == layer_name: + return m + raise ValueError(f"Cannot find op {layer_name} in module {module}") + + +def find_matched_pattern(query: str, patterns: list[str]) -> str | None: + patterns: list[re.Pattern] = [re.compile(pattern) for pattern in patterns] + + matched_patterns = [] + + for pattern in patterns: + if pattern.fullmatch(query): + matched_patterns.append(pattern) + + if len(matched_patterns) > 1: + raise ValueError(f"Multiple patterns matched: {matched_patterns}") + + return matched_patterns[0].pattern if len(matched_patterns) == 1 else None + + +def set_layer_by_name(module, name, new_layer): + levels = name.split(".") + if len(levels) > 1: + mod_ = module + for l_idx in range(len(levels) - 1): + if levels[l_idx].isdigit(): + mod_ = mod_[int(levels[l_idx])] + else: + mod_ = getattr(mod_, levels[l_idx]) + setattr(mod_, levels[-1], new_layer) + else: + setattr(module, name, new_layer) diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/search/__init__.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/search/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bdcc74be3525df669ac62ff143f1c8794b5da4e7 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/search/__init__.py @@ -0,0 +1,9 @@ +# from .search import ( +# search_quantisation_for_cls_runner, +# search_quantisation_for_prompting_cls_runner, +# ) + +from .search import SearchRunner + # SearchQuantisationForPromptingCLS) +# from .search_conditional import (SearchIntQuantisationForClassification, +# SearchIntQuantisationForPromptingCLS) diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/search/_search.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/search/_search.py new file mode 100644 index 0000000000000000000000000000000000000000..942ed064b1a646d8515c7be016ae6a4cd76016a6 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/search/_search.py @@ -0,0 +1,349 @@ +import logging +import gc +from pathlib import Path +import json +import ast + +import torch +import joblib + +import optuna +from optuna.pruners import BasePruner +import wandb +from accelerate import ( + infer_auto_device_map, + init_empty_weights, + load_checkpoint_and_dispatch, + dispatch_model, +) + +import sys +import optuna +import copy +from contextlib import contextmanager, redirect_stderr, redirect_stdout +from os import devnull + + +# hack + +from ..hf_model_map import get_config_cls, get_model_cls, get_tokenizer_cls +from ..utils import load_config + +# temporary +from ..models import quantize_transformer, get_cost + +import os +from tqdm import tqdm + +sys.path.append( + os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "..", + "..", + "lm-evaluation-harness", + ) +) + +from lm_eval.evaluator import simple_evaluate +from lm_eval.models.huggingface import HFLM + +optuna.logging.set_verbosity(optuna.logging.ERROR) +logger = logging.getLogger(__name__) + + +def sum_all_entries(flops): + # recursive sum + value = 0 + for k, v in flops.items(): + if isinstance(v, dict): + value += sum_all_entries(v) + elif isinstance(v, (int, float)): + value += v + return value + + +@contextmanager +def silent_harness_evaluator(): + with open(devnull, "w") as fnull: + with redirect_stdout(fnull) as out: + yield out + + +def my_dispatch_model(model, device, device_map): + if device_map: + model = dispatch_model(model, device_map) + else: + model.to(device) + return model + + +class 
DuplicateIterationPruner(BasePruner): + """ + DuplicatePruner + + Pruner to detect duplicate trials based on the integer list of trial.user_attrs["layers"]. + + This pruner is used to identify and prune trials that have the same set of parameters + as a previously completed trial. + """ + + def prune(self, study: "optuna.study.Study", trial: "optuna.trial.FrozenTrial") -> bool: + completed_trials = study.get_trials(states=[optuna.trial.TrialState.COMPLETE]) + + # construct a list of layers from the trial params + trial_layers = [] + # I hope the params are ordered by the order they were added, otherwise this will break the list assumption + for i in range(trial.params["num_hidden_layers"]): + v = trial.params[f"layer_{i}"] + layer = 1 if v < trial.params["ratio"] else 0 + trial_layers.append(layer) + + for completed_trial in completed_trials: + # reconstruct the layers list from the trial params + completed_trial_layers = [] + for i in range(completed_trial.params["num_hidden_layers"]): + v = completed_trial.params[f"layer_{i}"] + layer = 1 if v < completed_trial.params["ratio"] else 0 + completed_trial_layers.append(layer) + + # compare the two constructed layers lists to see if they are the same + if completed_trial_layers == trial_layers: + return True + + return False + + +class SearchBase: + def __init__( + self, + model_arch: str, + model_name: str, + search_config: dict | str, + save_dir: str, + device: str, + model_parallel: bool = False, + enable_wandb: bool = True, + local_hf_cache: str = None, + ) -> None: + self.model_arch = model_arch + self.model_name = model_name + self.model_cls = get_model_cls(model_arch, "lm") + self.local_hf_cache = local_hf_cache + self.config_cls = get_config_cls(model_arch) + self.tokenizer = get_tokenizer_cls(model_arch).from_pretrained(model_name, legacy=False) + self.model_config = self.config_cls.from_pretrained(model_name) + self.device = device + self.model_parallel = model_parallel + self.enable_wandb = enable_wandb + + self.search_config = search_config if isinstance(search_config, dict) else load_config(search_config) + + self.save_dir = Path(save_dir) + self._create_logger() + self.load_model_from_hf() + + def _create_logger(self): + self.save_dir.mkdir(parents=True, exist_ok=True) + logger = logging.getLogger(type(self).__name__) + logger.setLevel(logging.INFO) + fh = logging.FileHandler(self.save_dir / "search_log.csv") + fh.setLevel(logging.INFO) + logger.addHandler(fh) + self.logger = logger + self.logger.info("trial_number,avg_acc,saved_cost_ratio,cost,layers") + + def load_model_from_hf(self): + config = self.config_cls.from_pretrained(self.model_name) + config.use_cache = False + + self.device_map = None + + if self.local_hf_cache: + model = self.model_cls.from_pretrained( + self.model_name, config=config, local_files_only=True, cache_dir=self.local_hf_cache + ) + else: + model = self.model_cls.from_pretrained(self.model_name, config=config) + + if "cuda" in self.device and not self.model_parallel: + pass + elif "cuda" in self.device and self.model_parallel: + if hasattr(model, "tie_weights"): + model.tie_weights() + if "device_map" not in self.search_config["setup"] or self.search_config["setup"]["device_map"] == "auto": + device_map = infer_auto_device_map( + model, + no_split_module_classes=model._no_split_modules, + ) + logger.info("Using 'auto' device map.") + else: + device_map = ast.literal_eval(self.search_config["setup"]["device_map"]) + logger.info(f"device_map:\n{device_map}") + self.device_map = device_map + elif 
self.device == "cpu": + pass + else: + raise ValueError(f"Unknown device: {self.device}") + self.model = model + + +class SearchRunner(SearchBase): + """ + Perform quantisation search for bert-like models on classification tasks + - Bert-like model refers to a network consisting of the base model and a classifier head, already fine-tuned on downstream tasked. + - This class calls a evaluation function to get the results on glue tasks + """ + + def __init__( + self, + model_arch: str, + model_name: str, + seq_len: int, + search_config: dict | str, + save_dir: str, + device: str, + model_parallel: bool = False, + enable_wandb: bool = True, + local_hf_cache: str = None, + ) -> None: + super().__init__( + model_arch, + model_name, + search_config, + save_dir, + device, + model_parallel, + enable_wandb, + local_hf_cache, + ) + + self.tasks = self.search_config["setup"]["tasks"] + self.batch_size = self.search_config["setup"]["batch_size"] + self.limit = self.search_config["setup"]["num_samples_per_trial"] + self.num_trials = self.search_config["setup"]["num_trials"] + self.ratio = self.search_config["setup"]["ratio"] + self.randomnize_search = self.search_config["setup"]["random"] + self.seq_len = seq_len + + model_fp16 = copy.deepcopy(self.model) + model_fp16, config, flops = quantize_transformer(model_arch, model_fp16, self.search_config, [], seq_len) + + # convert to fp16 + model_fp16 = model_fp16.to(dtype=torch.float16) + + base_results = self.eval_task(my_dispatch_model(model_fp16, self.device, self.device_map), self.batch_size) + del model_fp16 + gc.collect() + base_acc = {} + for task in base_results["results"]: + base_acc[task] = base_results["results"][task]["acc"] + base_acc["avg_acc"] = sum(base_acc.values()) / len(base_acc) + base_bit_ops = sum_all_entries(flops) * 16 * 16 + self.base_bit_ops = base_bit_ops # for calculating delta cost + + logger.info("==================== FP16 Evaluation ====================") + base_eval = dict(base_accs=base_acc, base_bit_ops=base_bit_ops) + dumped = json.dumps(base_eval, indent=2) + logger.info(dumped) + with open(self.save_dir / "fp16_baseline.json", "w") as f: + f.write(dumped) + + # keep base model on cpu + self.model.to("cpu") + + logger.info("==================== Model initialized ====================") + + def eval_task( + self, + model, + batch_size: int, + ): + with silent_harness_evaluator(): + results = simple_evaluate( + model=HFLM(model), + tasks=self.tasks, + # fixme + batch_size=batch_size, + limit=self.limit, + ) + return results + + def search(self): + if self.randomnize_search: + study = optuna.create_study( + # directions=["maximize", "maximize"] + direction="maximize", + sampler=optuna.samplers.RandomSampler(), + pruner=DuplicateIterationPruner(), + ) # avg_acc, cost + else: + study = optuna.create_study( + # directions=["maximize", "maximize"] + direction="maximize", + pruner=DuplicateIterationPruner(), + ) # avg_acc, cost + + study.optimize(self.layer_wise_search, n_trials=self.num_trials, show_progress_bar=True) + + df = study.trials_dataframe() + df.to_pickle(self.save_dir / "search_results.pkl") + if self.enable_wandb: + wandb_table = wandb.Table(data=df) + wandb.log({"search_results": wandb_table}) + + best_trial_ids = [t.number for t in study.best_trials] + best_trial_df = df[df["number"].isin(best_trial_ids)] + best_trial_df.to_pickle(self.save_dir / "best_trials.pkl") + + logger.info("==================== Best Trials ====================") + selected_columns = ["number", "value", "state"] + logger.info("\n" + 
best_trial_df[selected_columns].rename(columns={"value": "avg_acc"}).to_markdown()) + # save study object as pkl + with open(self.save_dir / "study.pkl", "wb") as f: + joblib.dump(study, f) + + def layer_wise_search(self, trial: optuna.Trial): + # copy it from cpu + model = copy.deepcopy(self.model) + layers = [] + + num_matmuls_per_layer = {"opt": 8, "llama": 9} + + if self.search_config["setup"]["granularity"] == "matmult": + rng = range(self.model.config.num_hidden_layers * num_matmuls_per_layer[self.model_arch]) + elif self.search_config["setup"]["granularity"] == "transformer_layer": + rng = range(self.model.config.num_hidden_layers) + + for i in rng: + v = trial.suggest_float(f"layer_{i}", 0, 1) + layer = ( + 1 if v < self.ratio else 0 + ) # 1 means quantized, 0 means not quantized, and the ratio means quantized ratio + layers.append(layer) + print(layers) + model, config, flops = quantize_transformer(self.model_arch, model, self.search_config, layers, self.seq_len) + + # convert to fp16 + model = model.to(dtype=torch.float16) + + if hasattr(model, "tie_weights"): + model.tie_weights() + model = my_dispatch_model(model, self.device, self.device_map) + task_accs = self.eval_task(model, self.batch_size) + cost = get_cost(self.model_arch, model, flops, dict(config.quant_config)) + del model + gc.collect() + + avg_acc = [task_accs["results"][t]["acc"] for t in self.tasks] + avg_acc = sum(avg_acc) / len(avg_acc) + # this is a simple test, maybe we should look at delta acc and delta resource + saved_cost_ratio = (self.base_bit_ops - cost) / self.base_bit_ops # the larger the better + logger.info( + f"Trial {trial.number} | avg_acc: {avg_acc:.4f} | saved_cost_ratio: {saved_cost_ratio:.4f} | layers: {layers}" + ) + self.logger.info(f"{trial.number},{avg_acc},{saved_cost_ratio},{cost},{layers}") + if self.enable_wandb: + wandb.log({"avg_acc": avg_acc, "saved_cost": saved_cost_ratio}) + # trial.set_user_attr("accuracy", {task: task_accs["results"][task]["acc"] for task in self.tasks}) + # trial.set_user_attr("saved_cost", saved_cost_ratio) + return avg_acc diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/search/search.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/search/search.py new file mode 100644 index 0000000000000000000000000000000000000000..20f15d7dada153ace0963da0e09626939aa56482 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/search/search.py @@ -0,0 +1,525 @@ +import logging +import gc +from pathlib import Path +import json +import ast +from pprint import pformat +import time + + +import torch +import joblib + +import transformers +import optuna +from optuna.pruners import BasePruner +import wandb +from accelerate import ( + infer_auto_device_map, + dispatch_model, +) + +import optuna +import copy +from contextlib import contextmanager, redirect_stdout, redirect_stderr +from os import devnull + + +from lm_eval.evaluator import simple_evaluate, TaskManager +from lm_eval.models.huggingface import HFLM + + +from ..models import quantize_transformer, estimate_cost +from ..utils.config_load import load_config + +from ..custom_tasks.open_alpaca import alpaca_evaluator, calculate_alpaca_ppl +from ..custom_tasks.slim_pajama import pajama_evaluator, calculate_pajama_ppl + +from .. 
import SCALING_LAW_SRC + +logger = logging.getLogger(__name__) +logger.propagate = False + +CUSTOM_HARNESS_TASK_DIR = SCALING_LAW_SRC.parent.joinpath("custom_harness_tasks") + +task_manager = TaskManager( + include_path=CUSTOM_HARNESS_TASK_DIR.as_posix(), + include_defaults=True, +) + + +@contextmanager +def silent_harness_evaluator(): + with open(devnull, "w") as fnull: + with redirect_stdout(fnull) as out: + yield out + + +def create_device_map(model, device_map) -> dict[str, int]: + if device_map == "auto": + device_map = infer_auto_device_map( + model, + no_split_module_classes=model._no_split_modules, + ) + elif device_map == "auto-balanced": + max_memory = {i: torch.cuda.mem_get_info(i)[0] // 2 for i in range(torch.cuda.device_count())} + device_map = infer_auto_device_map( + model, + no_split_module_classes=model._no_split_modules, + max_memory=max_memory, + ) + n_devices = torch.cuda.device_count() + n_decoder_layers = model.config.num_hidden_layers + n_layers_per_device = n_decoder_layers // n_devices + balanced_device_map = {} + current_device = 0 + current_decoder_idx = 0 + + for layer_name in device_map: + if ".layers." in layer_name: + if (current_decoder_idx + 1) % n_layers_per_device == 0: + current_device += 1 + current_decoder_idx += 1 + balanced_device_map[layer_name] = min(current_device, n_devices - 1) + device_map = balanced_device_map + else: + assert isinstance(device_map, dict) + return device_map + + +class DuplicateIterationPruner(BasePruner): + """ + DuplicatePruner + + Pruner to detect duplicate trials based on the integer list of trial.user_attrs["layers"]. + + This pruner is used to identify and prune trials that have the same set of parameters + as a previously completed trial. + """ + + def prune(self, study: "optuna.study.Study", trial: "optuna.trial.FrozenTrial") -> bool: + completed_trials = study.get_trials(states=[optuna.trial.TrialState.COMPLETE]) + + # construct a list of layers from the trial params + trial_layers = [] + # I hope the params are ordered by the order they were added, otherwise this will break the list assumption + for i in range(trial.params["num_hidden_layers"]): + v = trial.params[f"layer_{i}"] + layer = 1 if v < trial.params["ratio"] else 0 + trial_layers.append(layer) + + for completed_trial in completed_trials: + # reconstruct the layers list from the trial params + completed_trial_layers = [] + for i in range(completed_trial.params["num_hidden_layers"]): + v = completed_trial.params[f"layer_{i}"] + layer = 1 if v < completed_trial.params["ratio"] else 0 + completed_trial_layers.append(layer) + + # compare the two constructed layers lists to see if they are the same + if completed_trial_layers == trial_layers: + return True + + return False + + +class SearchBase: + def __init__( + self, + model_arch: str, + model_name: str, + search_config: dict, + save_dir: str, + device: str, + model_parallel: bool, + enable_wandb: bool, + local_hf_cache: str, + ): + self.model_name = model_name + self.local_hf_cache = local_hf_cache + self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) + self.model_parallel = model_parallel + self.device = device + + self.enable_wandb = enable_wandb + + self.search_config = search_config if isinstance(search_config, dict) else load_config(search_config) + self.save_dir = Path(save_dir) + + self._create_logger() + + def _create_logger(self): + self.save_dir.mkdir(parents=True, exist_ok=True) + logger = logging.getLogger("search_csv_logger") + logger.setLevel(logging.INFO) + fh = 
logging.FileHandler(self.save_dir / "search_log.csv") + fh.setLevel(logging.INFO) + logger.addHandler(fh) + self.search_csv_logger = logger + self.search_csv_logger.info("trial_number,avg_metric,saved_cost_ratio,cost,layers") + + logger = logging.getLogger("eval_csv_logger") + logger.setLevel(logging.INFO) + fh = logging.FileHandler(self.save_dir / "eval_log.csv") + fh.setLevel(logging.INFO) + logger.addHandler(fh) + self.eval_csv_logger = logger + self.eval_csv_logger.info("search_trail_number,eval_metric,trail_metric") + + def load_model(self): + config = transformers.AutoConfig.from_pretrained( + self.model_name, torch_dtype=torch.bfloat16, _attn_implementation="eager" + ) + other_kwargs = {} + if config.max_position_embeddings > 2048: + config.max_position_embeddings = 2048 + + if self.local_hf_cache: + model = transformers.AutoModelForCausalLM.from_pretrained( + self.model_name, config=config, local_files_only=True, cache_dir=self.local_hf_cache, **other_kwargs + ) + else: + model = transformers.AutoModelForCausalLM.from_pretrained(self.model_name, config=config, **other_kwargs) + + if self.model_parallel: + if hasattr(model, "tie_weights"): + model.tie_weights() + + if "device_map" not in self.search_config["setup"]: + device_map = create_device_map(model, "auto-balanced") + else: + _device_map = self.search_config["setup"]["device_map"] + if isinstance(_device_map, str) and _device_map in ["auto", "auto-balanced"]: + device_map = create_device_map(model, _device_map) + else: + device_map = ast.literal_eval(_device_map) + + self.device_map = device_map + else: + self.device_map = None + + return model + + +def calculate_sw_metric(result_dict): + avg_acc = [] + avg_ppl = [] + for task in result_dict["results"]: + if "acc,none" in result_dict["results"][task]: + acc_entry = "acc,none" + avg_acc.append(result_dict["results"][task][acc_entry]) + elif "word_perplexity,none" in result_dict["results"][task]: + ppl_entry = "word_perplexity,none" + # *: -1 to maximize the perplexity + avg_ppl.append(-result_dict["results"][task][ppl_entry]) + else: + raise ValueError(f"Failed to find 'acc' entry in {task} result:\n{result_dict['results'][task]}") + + if avg_acc: + avg_ = sum(avg_acc) / len(avg_acc) + elif avg_ppl: + avg_ = sum(avg_ppl) / len(avg_ppl) + + return avg_ + + +def calculate_mmlu_metric(result_dict): + # special handling for mmlu metric + avg_mmlu = result_dict["results"]["mmlu"]["acc,none"] + mmlu_stem = result_dict["results"]["mmlu_stem"]["acc,none"] + mmlu_humanities = result_dict["results"]["mmlu_humanities"]["acc,none"] + mmlu_social_science = result_dict["results"]["mmlu_social_sciences"]["acc,none"] + mmlu_other = result_dict["results"]["mmlu_other"]["acc,none"] + + return avg_mmlu, mmlu_stem, mmlu_humanities, mmlu_social_science, mmlu_other + + +class SearchRunner(SearchBase): + def __init__( + self, + model_arch: str, + model_name: str, + seq_len: int, + search_config: dict, + save_dir: str, + device: str, + model_parallel: bool, + enable_wandb: bool, + local_hf_cache: str, + ): + super().__init__( + model_arch, model_name, search_config, save_dir, device, model_parallel, enable_wandb, local_hf_cache + ) + + self.tasks = self.search_config["setup"]["tasks"] + self.batch_size = self.search_config["setup"]["batch_size"] + self.limit = self.search_config["setup"]["num_samples_per_trial"] + self.num_trials = self.search_config["setup"]["num_trials"] + self.ratio = self.search_config["setup"]["ratio"] + self.randomnize_search = self.search_config["setup"]["random"] + 
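# the settings above all come from the [setup] table of the search config: "ratio" is the threshold that decides how many ops a trial runs in low precision, and "random" switches the Optuna sampler between RandomSampler and TPESampler +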
self.seq_len = seq_len + self.metric_history = [] + self.quantized_op_list_history = [] + + bf16_model = self.load_model() + self.bf16_model = bf16_model + initial_evaluate_model = copy.deepcopy(bf16_model) + + # move the model to the device + if self.device_map is not None: + initial_evaluate_model = dispatch_model(initial_evaluate_model, self.device_map) + else: + initial_evaluate_model = initial_evaluate_model.to(self.device) + + bf16_metric_dict = self.eval_tasks(initial_evaluate_model) + bf16_cost, bf16_cost_raw = estimate_cost(initial_evaluate_model, None, seq_len) + del initial_evaluate_model + gc.collect() + + self.bf16_cost = bf16_cost + # currently the custom tasks and the default lm-eval harness are mutually exclusive + # this is a limitation of the current implementation + if "custom_alpaca" in self.tasks: + bf16_avg_metric = calculate_alpaca_ppl(bf16_metric_dict) + elif "custom_pajama" in self.tasks: + bf16_avg_metric = calculate_pajama_ppl(bf16_metric_dict) + else: + bf16_avg_metric = calculate_sw_metric(bf16_metric_dict) + + logger.info("========== BF16 Evaluation ==========") + logger.info(f"Average Metric: {bf16_avg_metric}") + logger.info(f"Cost: {bf16_cost}") + logger.info(f"Raw metric:\n{pformat(bf16_metric_dict['results'], sort_dicts=False)}") + # add additional logging to the csv for raw model cost + self.search_csv_logger.info(f"fp16,{bf16_avg_metric},0,{bf16_cost}") + + with open(self.save_dir / "bf16_baseline_metric.json", "w") as f: + json.dump(bf16_metric_dict["results"], f) + with open(self.save_dir / "bf16_baseline_cost.json", "w") as f: + json.dump(bf16_cost_raw, f) + + def eval_tasks( + self, + model, + ): + # currently the custom tasks and the default lm-eval harness are mutually exclusive + # this is a limitation of the current implementation + if "custom_alpaca" in self.tasks: + logger.info("Calculating using custom open alpaca metric") + # calculate and formulate result with custom alpaca evaluator + # note that the metric returs is ppl but not acc + results = alpaca_evaluator( + model=model, tokenizer=self.tokenizer, batch_size=self.batch_size, limit=self.limit + ) + elif "custom_pajama" in self.tasks: + logger.info("Calculating using custom slim pajima metric") + # calculate and formulate result with custom pajima evaluator + # note that the metric returs is ppl but not acc + results = pajama_evaluator( + model=model, tokenizer=self.tokenizer, batch_size=self.batch_size, limit=self.limit + ) + else: + logger.info("Calculating using default lm-eval harness") + with silent_harness_evaluator(): + results = simple_evaluate( + model=HFLM(model, tokenizer=self.tokenizer), + tasks=self.tasks, + batch_size=self.batch_size, + limit=self.limit, + task_manager=task_manager, + ) + return results + + def search(self): + if self.randomnize_search: + sampler = optuna.samplers.RandomSampler() + else: + sampler = optuna.samplers.TPESampler() + + study = optuna.create_study( + direction="maximize", + pruner=DuplicateIterationPruner(), + sampler=sampler, + ) + + study.optimize(self.search_objective, n_trials=self.num_trials, show_progress_bar=True) + + save_path = self.save_dir / "study.pkl" + joblib.dump(study, save_path) + + def search_objective(self, trial: optuna.Trial): + q_model = copy.deepcopy(self.bf16_model) + if hasattr(q_model, "tie_weights"): + q_model.tie_weights() + + if q_model.config.architectures[0] == "LlamaForCausalLM": + num_gemms_per_layer = 9 + elif q_model.config.architectures[0] == "OPTForCausalLM": + num_gemms_per_layer = 8 + elif 
q_model.config.architectures[0] == "Qwen2ForCausalLM": + num_gemms_per_layer = 9 + elif q_model.config.architectures[0] == "PhiForCausalLM": + num_gemms_per_layer = 8 + elif q_model.config.architectures[0] == "Phi3ForCausalLM": + num_gemms_per_layer = 6 + elif q_model.config.architectures[0] == "Gemma2ForCausalLM": + num_gemms_per_layer = 9 + else: + raise ValueError(f"Unknown model architecture {q_model.config.architectures[0]}") + + granularity = self.search_config["setup"]["granularity"] + + if granularity == "matmult": + op_range = range(self.bf16_model.config.num_hidden_layers * num_gemms_per_layer) + _granularity = "gemm" + elif granularity == "transformer_layer": + op_range = range(self.bf16_model.config.num_hidden_layers) + _granularity = "decoder_layer" + else: + raise ValueError(f"Unknown granularity {granularity}") + + is_quantized = [] + + for i in op_range: + v = trial.suggest_float(f"layer_{i}", 0, 1) + is_quantized.append(1 if v < self.ratio else 0) + + op_ids = [] + for i, is_q in enumerate(is_quantized): + if is_q: + op_ids.append(i) + + q_config_raw = self.search_config["quantization"] + q_model, q_config_full = quantize_transformer( + q_model, q_config=q_config_raw, op_ids=op_ids, granularity=_granularity + ) + if self.device_map is not None: + q_model = dispatch_model(q_model, self.device_map) + else: + q_model = q_model.to(self.device) + # downstream tasks evaluation, + # currently the custom tasks and the default lm-eval harness are mutually exclusive + task_metric_dict = self.eval_tasks(q_model) + if "custom_alpaca" in self.tasks: + avg_metric = calculate_alpaca_ppl(task_metric_dict) + elif "custom_pajama" in self.tasks: + avg_metric = calculate_pajama_ppl(task_metric_dict) + else: + avg_metric = calculate_sw_metric(task_metric_dict) + + cost, raw_cost = estimate_cost(q_model, q_config_full, self.seq_len) + + del q_model + gc.collect() + torch.cuda.empty_cache() + # sleep for 30 seconds to allow the GPU to totally evict the model + time.sleep(30) + + saved_cost_ratio = (self.bf16_cost - cost) / self.bf16_cost + self.search_csv_logger.info(f"{trial.number},{avg_metric},{saved_cost_ratio},{cost},{is_quantized}") + logger.info( + f"Trial {trial.number} | Avg Metric: {avg_metric} | Saved Cost Ratio: {saved_cost_ratio: .4f} | Layers: {is_quantized}" + ) + + # record search history + self.metric_history.append(avg_metric) + self.quantized_op_list_history.append(is_quantized) + + if self.enable_wandb: + wandb.log(dict(avg_metric=avg_metric, saved_cost_ratio=saved_cost_ratio)) + + return avg_metric + + def evaluate(self): + config = self.search_config["evaluation"] + tasks = config["tasks"] + batch_size = config["batch_size"] + best_n_epoch = config["num_best_epochs"] + + granularity = self.search_config["setup"]["granularity"] + if granularity == "matmult": + granularity = "gemm" + elif granularity == "transformer_layer": + granularity = "decoder_layer" + else: + raise ValueError(f"Unknown granularity {granularity}") + + logger.info("evaluate best {} epoch".format(best_n_epoch)) + + eval_results = [] + avg_metrics = [] + + # choose best n epoch from the search history + best_n_idx = sorted(range(len(self.metric_history)), key=lambda i: self.metric_history[i], reverse=True)[ + :best_n_epoch + ] + for best_idx in best_n_idx: + logger.info("evaluate best {}/{} epoch".format(best_idx, best_n_epoch)) + q_model = copy.deepcopy(self.bf16_model) + if self.device_map is not None: + q_model = dispatch_model(q_model, self.device_map) + else: + q_model = q_model.to(self.device) + + 
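# Re-apply the best trial's configuration: the stored 0/1 quantisation mask over ops is
# mapped back to op ids before quantising, e.g. a mask of [1, 0, 1, 0] selects op_ids [0, 2].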
quantized_op_list = self.quantized_op_list_history[best_idx] + # construct op_ids for quantization + op_ids = [] + for i, is_q in enumerate(quantized_op_list): + if is_q: + op_ids.append(i) + + q_model, q_config_full = quantize_transformer( + q_model, q_config=self.search_config["quantization"], op_ids=op_ids, granularity=granularity + ) + + if self.device_map is not None: + q_model = dispatch_model(q_model, self.device_map) + else: + q_model = q_model.to(self.device) + + with silent_harness_evaluator(): + results = simple_evaluate( + model=HFLM(q_model, tokenizer=self.tokenizer), + tasks=tasks, + batch_size=batch_size, + task_manager=task_manager, + ) + + if tasks == "mmlu": + # special handling for mmlu metric as its a group of metrics + metric = calculate_mmlu_metric(results) + avg_metric = metric[0] + else: + metric = calculate_sw_metric(results) + avg_metric = metric + + eval_results.append(metric) + avg_metrics.append(avg_metric) + + self.eval_csv_logger.info(f"{best_idx},{avg_metric},{avg_metric}") + + del q_model + gc.collect() + torch.cuda.empty_cache() + # sleep for 30 seconds to allow the GPU to totally evict the model + time.sleep(30) + + best_avg_metric_idx = avg_metrics.index(max(avg_metrics)) + best_trial_idx = best_n_idx[best_avg_metric_idx] + best_eval_metric = avg_metrics[best_avg_metric_idx] + best_search_metric = self.metric_history[best_trial_idx] + full_eval_result = {"best_n_idx": best_n_idx, "eval_results": eval_results} + + eval_save_path = self.save_dir / "full_eval_result.pkl" + joblib.dump(full_eval_result, eval_save_path) + + return_dict = best_eval_metric, best_search_metric, best_trial_idx, full_eval_result + + eval_result_dict = { + "best_eval_metric": best_eval_metric, + "best_search_metric": best_search_metric, + "best_trial_idx": best_trial_idx, + } + with open(self.save_dir / "eval_result.json", "w") as f: + json.dump(eval_result_dict, f) + return return_dict diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/search/search_conditional.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/search/search_conditional.py new file mode 100644 index 0000000000000000000000000000000000000000..f615949f96040c6274e18b39e6f21e02b1b405fd --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/search/search_conditional.py @@ -0,0 +1,1280 @@ +import ast +import json +import logging +from functools import partial +from pathlib import Path +from pprint import pformat + +import datasets +import joblib +import optuna +import pandas as pd +import transformers +from accelerate import (infer_auto_device_map, init_empty_weights, + load_checkpoint_and_dispatch) +from tabulate import tabulate + +from ..eval import eval_cls_glue as evaluate_cls_task +from ..eval import eval_dse_results, eval_prompting_tasks +from ..models import (get_config_cls, get_model_cls, get_model_profiler, + get_quant_config_parser, get_quant_config_sampler, + get_stat_config_formatter, get_tokenizer_cls) +from ..models.quantize import transform_stat_profile_to_int_quant_config +from ..utils import flatten_dict, load_config, save_config + +optuna.logging.set_verbosity(optuna.logging.ERROR) + +logger = logging.getLogger(__name__) + + +class SearchBase: + def __init__( + self, + model_arch: str, + model_name: str, + task: str, + search_config: dict | str, + save_dir: str, + device: str, + model_parallel: bool = False, + ) -> None: + self.model_arch = model_arch + self.model_name = model_name + self.model_cls = get_model_cls(model_arch, task) + self.config_cls = get_config_cls(model_arch) + 
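# The model, config and tokenizer classes are resolved per `model_arch` via the registry
# helpers imported from ..models; `task` selects the head ("cls" for the GLUE
# classification search, "lm" for the prompting search defined below).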
self.tokenizer = get_tokenizer_cls(model_arch).from_pretrained(model_name) + self.model_config = self.config_cls.from_pretrained(model_name) + self.device = device + self.model_parallel = model_parallel + + self.search_config = ( + search_config + if isinstance(search_config, dict) + else load_config(search_config) + ) + self.save_dir = Path(save_dir) + self._create_logger() + + def _create_logger(self): + self.save_dir.mkdir(parents=True, exist_ok=True) + logger = logging.getLogger(type(self).__name__) + logger.setLevel(logging.INFO) + fh = logging.FileHandler(self.save_dir / "search_log.csv") + fh.setLevel(logging.INFO) + logger.addHandler(fh) + self.logger = logger + + def rebuild_model(self, quant_config): + if quant_config is None: + config = self.config_cls.from_pretrained(self.model_name) + else: + config = self.config_cls.from_pretrained( + self.model_name, quant_config=quant_config + ) + if "cuda" in self.device: + if self.model_parallel: + with init_empty_weights(): + model = self.model_cls(config) + device_map = infer_auto_device_map( + model, + no_split_module_classes=model._no_split_modules, + ) + model = load_checkpoint_and_dispatch( + model, checkpoint=self.model_name, device_map=device_map + ) + else: + model = self.model_cls.from_pretrained( + self.model_name, config=config + ).to(self.device) + elif self.device == "cpu": + model = self.model_cls.from_pretrained(self.model_name, config=config) + else: + raise ValueError(f"Unknown device: {self.device}") + return model + + +class SearchIntQuantisationForClassification(SearchBase): + """ + Perform quantisation search for bert-like models on classification tasks + - Bert-like model refers to a network consisting of the base model and a classifier head, already fine-tuned on downstream tasked. 
+ - This class calls a evaluation function to get the results on glue tasks + """ + + def __init__( + self, + model_arch: str, + model_name: str, + search_config: dict | str, + save_dir: str, + num_labels: int, + device: str, + model_parallel: bool = False, + ) -> None: + super().__init__( + model_arch, + model_name, + "cls", + search_config, + save_dir, + device, + model_parallel, + ) + self.q_bitwidth_profiler = get_model_profiler(model_arch) + self.q_config_parser = get_quant_config_parser(model_arch) + # TODO: use a general recursive quant config parser, which traverses dict to sample leaf values (each leaf value is a list of choices) + self.q_config_sampler = get_quant_config_sampler(model_arch) + self.q_config_formatter = get_stat_config_formatter(model_arch) + self.num_labels = num_labels + + self._pre_search_check() + + def _pre_search_check(self): + if self.search_config["search_estimator"]["alpha_accuracy"] == 0: + assert ( + self.search_config["search_strategy"]["accuracy_threshold"] == 0 + ), "alpha_accuracy is 0, please set accuracy_threshold to 0 as well" + if self.search_config["search_estimator"]["alpha_memory_density"] == 0: + assert ( + self.search_config["search_strategy"]["avg_bitwidth_threshold"] == 0 + ), "alpha_memory_density is 0, please set avg_bitwidth_threshold to 0 as well" + if self.search_config["search_estimator"]["alpha_fps"] == 0: + assert ( + self.search_config["search_strategy"]["fps_threshold"] == 0 + ), "alpha_fps is 0, please set fps_threshold to 0 as well" + if self.search_config["search_estimator"]["fps_per_lut"] == 0: + assert ( + self.search_config["search_strategy"]["fps_threshold"] == 0 + ), "fps_per_lut is 0, please set fps_threshold to 0 as well" + + def rebuild_model(self, quant_config): + if quant_config is None: + config = self.config_cls.from_pretrained( + self.model_name, num_labels=self.num_labels + ) + else: + config = self.config_cls.from_pretrained( + self.model_name, quant_config=quant_config, num_labels=self.num_labels + ) + if "cuda" in self.device: + if self.model_parallel: + with init_empty_weights(): + model = self.model_cls(config) + device_map = infer_auto_device_map( + model, + no_split_module_classes=model._no_split_modules, + ) + model = load_checkpoint_and_dispatch( + model, checkpoint=self.model_name, device_map=device_map + ) + logger.debug("Model parallelism enabled") + else: + model = self.model_cls.from_pretrained( + self.model_name, config=config + ).to(self.device) + logger.debug(f"Running on single device: {self.device}") + elif self.device == "cpu": + model = self.model_cls.from_pretrained(self.model_name, config=config) + logger.debug("Running on CPU") + else: + raise ValueError(f"Unknown device: {self.device}") + return model + + def search( + self, + eval_dataloader, + task: str, + is_regression: bool, + seq_len: int, + stat_profiler: dict, + range_entry: str = "range_min_max", + num_samples_per_trial: int = 512, + ): + def compute_software_metric( + model, task, eval_dataloader, is_regression, num_samples + ) -> dict: + results = evaluate_cls_task( + model, + task, + eval_dataloader, + is_regression=is_regression, + num_samples=num_samples, + ) + match task: + case "sst2": + accuracy = results["accuracy"] + case _: + raise NotImplementedError(f"task {task} not implemented") + s_metric = { + "accuracy": accuracy, + } + return s_metric + + def compute_hardware_metric(profiler, config, seq_len, compare_to=32) -> dict: + results = profiler(config, seq_len) + num_params = results["num_params"] + num_acts = 
results["num_acts"] + param_bits = results["param_bits"] + act_bits = results["act_bits"] + flops = results["flops"] + + param_bits_fp32 = compare_to * num_params + act_bits_fp32 = compare_to * num_acts + + mem_density = (param_bits_fp32 + act_bits_fp32) / (param_bits + act_bits) + h_metric = { + "memory_density": mem_density, + } + + results = eval_dse_results(config, is_mixed=True) + h_metric.update( + { + "fps": results["best_fps"], + "fps_per_lut": results["best_fps"] / results["resource"], + } + ) + return h_metric + + def objective( + trial: optuna.Trial, + # quant_config_sampler, + quant_config_seed, + seq_len: int, + stat_profile: dict, + range_entry: str, + file_logger: logging.Logger, + num_samples: int, + ): + """ + seed -> sampled_config -> extend sampled to full config -> determine frac_width -> evaluate + """ + if self.search_config["search_space"]["extend_quant_config_seed_first"]: + quant_config_seed = self.q_config_parser( + quant_config_seed, self.model_config.num_hidden_layers, strict=False + ) + + # TODO: create a general recursive quant config parser + sampled_config = self.q_config_sampler( + trial=trial, + name="root", + config_seed=quant_config_seed, + ) + + sampled_config = self.q_config_parser( + sampled_config, self.model_config.num_hidden_layers, strict=False + ) + flattened_sampled_config = {} + flatten_dict(sampled_config, new_d=flattened_sampled_config) + + sampled_config_complete = sampled_config + sampled_config = transform_stat_profile_to_int_quant_config( + stat_profile, + range_entry=range_entry, + width=flattened_sampled_config, + frac_choices=None, + root_name="root", + is_ptq=True, + bypass=False, + ) + + self.q_config_formatter( + sampled_config, + self.model_config.num_hidden_layers, + default_config=sampled_config_complete, + is_ptq=True, + bypass=False, + ) + + model = self.rebuild_model(sampled_config) + + s_metric = compute_software_metric( + model=model, + task=task, + eval_dataloader=eval_dataloader, + is_regression=is_regression, + num_samples=num_samples, + ) + h_metric = compute_hardware_metric( + self.q_bitwidth_profiler, + model.config, + seq_len=seq_len, + compare_to=self.search_config["search_estimator"]["compare_to"], + ) + metric_name_list = list(s_metric.keys()) + list(h_metric.keys()) + scaled_metric_list = [] + metric_list = list(s_metric.values()) + list(h_metric.values()) + + # accuracy + for metric_name, metric in s_metric.items(): + scaled_metric_list.append( + metric + * self.search_config["search_estimator"][f"alpha_{metric_name}"] + ) + # memory density, fps + for metric_name, metric in h_metric.items(): + scaled_metric_list.append( + metric + * self.search_config["search_estimator"][f"alpha_{metric_name}"] + ) + + if trial.number == 0: + file_logger.info( + f"trial_id," + + ",".join(metric_name_list) + + "," + + ",".join(map(lambda x: f"scaled_{x}", metric_name_list)) + ) + + file_logger.info( + f"{trial.number}," + + ",".join(map(str, metric_list)) + + "," + + ",".join(map(str, scaled_metric_list)) + ) + + return (*scaled_metric_list,) + + def logger_callback( + study: optuna.Study, frozen_trail: optuna.trial.FrozenTrial + ): + acc, mem_density, fps, fps_per_lut = frozen_trail.values + # fmt: off + ori_acc = acc / (self.search_config["search_estimator"]["alpha_accuracy"] + 1e-8) + ori_mem_density = mem_density / (self.search_config["search_estimator"]["alpha_memory_density"] + 1e-8) + ori_fps = fps / (self.search_config["search_estimator"]["alpha_fps"] + 1e-8) + ori_fps_per_lut = fps_per_lut / 
(self.search_config["search_estimator"]["alpha_fps"] + 1e-8) + + avg_bitwidth = self.search_config["search_estimator"]["compare_to"] / ori_mem_density + # fmt: on + logger.info( + f"Trial {frozen_trail.number} is done: " + f"unscaled (accuracy, mem_density, fps, fps_per_lut) = " + f"({ori_acc:.4f}, {ori_mem_density:.2f}, {ori_fps:.2f}, {ori_fps_per_lut:.2e}), " + f"scaled (...) = " + f"({acc:.4f}, {mem_density:.2f}, {fps:.2f}, {fps_per_lut:.2f}), " + f"avg_bitwidth = {avg_bitwidth:.1f}" + ) + + # create sampler and study + match self.search_config["search_strategy"]["sampler"].lower(): + case "random": + sampler = optuna.samplers.RandomSampler() + case "tpe": + sampler = optuna.samplers.TPESampler() + case "nsgaii": + sampler = optuna.samplers.NSGAIISampler() + case "nsgaiii": + sampler = optuna.samplers.NSGAIIISampler() + case "qmc": + sampler = optuna.samplers.QMCSampler() + case _: + raise ValueError( + f"Unknown sampler name: {self.search_config['search_strategy']['sampler']}" + ) + logger.info(f"Using sampler: {sampler.__class__.__name__}") + study = optuna.create_study( + directions=["maximize", "maximize", "maximize", "maximize"], + sampler=sampler, + ) + + # sample configs + q_config_seed = self.search_config["search_space"]["quant_config_seed"] + + study.optimize( + func=partial( + objective, + quant_config_seed=q_config_seed, + seq_len=seq_len, + stat_profile=stat_profiler, + range_entry=range_entry, + file_logger=self.logger, + num_samples=num_samples_per_trial, + ), + n_trials=self.search_config["search_strategy"]["n_trials"], + n_jobs=self.search_config["search_strategy"]["n_jobs"], + timeout=self.search_config["search_strategy"].get("timeout", None), + show_progress_bar=True, + callbacks=[logger_callback], + ) + + self.save_study_and_results(study, stat_profiler, range_entry) + return study + + @staticmethod + def save_trial_to_quant_config( + trial: optuna.trial.FrozenTrial, + q_config_parser: callable, + q_config_formatter: callable, + num_hidden_layers: int, + stat_profile: dict, + range_entry: str, + save_path: str = None, + ): + def parse_and_create_item(quant_config: dict, keys: list[str], value): + for i, key in enumerate(keys): + if key not in quant_config: + quant_config[key] = {} + if i == len(keys) - 1: + quant_config[key] = value + else: + quant_config = quant_config[key] + + params = trial.params + + sampled_config = {} + for name, value in params.items(): + keys = name.removeprefix("root:").split(":") + if isinstance(value, str) and value.startswith("!ast!"): + value = ast.literal_eval(value.removeprefix("!ast!")) + parse_and_create_item(sampled_config, keys, value) + # here we got sampled_config = self.q_config_sampler in self.search + + sampled_config = q_config_parser( + sampled_config, num_hidden_layers, strict=False + ) + flattened_sampled_config = {} + flatten_dict(sampled_config, new_d=flattened_sampled_config) + + sampled_config_complete = sampled_config + sampled_config = transform_stat_profile_to_int_quant_config( + stat_profile, + range_entry=range_entry, + width=flattened_sampled_config, + frac_choices=None, + root_name="root", + is_ptq=True, + bypass=False, + ) + q_config_formatter( + sampled_config, + num_hidden_layers, + default_config=sampled_config_complete, + is_ptq=True, + bypass=False, + ) + + if save_path is not None: + save_config(sampled_config, save_path) + return sampled_config + + @staticmethod + def get_result_df( + study: optuna.Study, + q_config_parser: callable, + q_config_formatter: callable, + num_hidden_layers: int, + 
stat_profile: dict, + range_entry: str, + save_dir, + alpha_acc: float, + alpha_mem_density: float, + alpha_fps: float, + alpha_fps_per_lut: float, + compare_to: int, + ) -> pd.DataFrame: + result_df = pd.DataFrame( + columns=[ + "trial_id", + "accuracy", + "memory_density", + "fps", + "fps_per_lut", + "scaled_accuracy", + "scaled_memory_density", + "scaled_fps", + "scaled_fps_per_lut", + "quant_config_path", + "avg_bitwidth", + "quant_config", + "datetime_start", + "datetime_end", + ] + ) + quant_config_dir = save_dir / "quant_configs" + quant_config_dir.mkdir(parents=True, exist_ok=True) + for i, trial in enumerate(study.best_trials): + trial_id = trial.number + quant_config_path = quant_config_dir / f"quant_config_{i}.toml" + quant_config = ( + SearchIntQuantisationForClassification.save_trial_to_quant_config( + trial, + q_config_parser=q_config_parser, + q_config_formatter=q_config_formatter, + num_hidden_layers=num_hidden_layers, + stat_profile=stat_profile, + range_entry=range_entry, + save_path=quant_config_path, + ) + ) + ( + scaled_acc, + scaled_mem_density, + scaled_fps, + scaled_fps_per_lut, + ) = trial.values + acc = scaled_acc / (alpha_acc + 1e-8) + mem_density = scaled_mem_density / (alpha_mem_density + 1e-8) + fps = scaled_fps / (alpha_fps + 1e-8) + fps_per_lut = scaled_fps_per_lut / (alpha_fps + 1e-8) + avg_bitwidth = compare_to / mem_density + result_df.loc[i] = [ + trial_id, + acc, + mem_density, + fps, + fps_per_lut, + scaled_acc, + scaled_mem_density, + scaled_fps, + scaled_fps_per_lut, + quant_config_path, + avg_bitwidth, + quant_config, + trial.datetime_start.strftime("%Y-%m-%d %H:%M:%S"), + trial.datetime_complete.strftime("%Y-%m-%d %H:%M:%S"), + ] + result_df = result_df.sort_values(by="accuracy", ascending=False) + return result_df + + def save_study_and_results(self, study: optuna.Study, stat_profile, range_entry): + save_dir = Path(self.save_dir) + save_dir.mkdir(parents=True, exist_ok=True) + study_path = save_dir / "study.pkl" + result_table_path = save_dir / "results.csv" + search_config_path = save_dir / "search_config.toml" + save_config(self.search_config, search_config_path) + + # fmt: off + result_df = SearchIntQuantisationForClassification.get_result_df( + study, + q_config_parser=self.q_config_parser, + q_config_formatter=self.q_config_formatter, + num_hidden_layers=self.model_config.num_hidden_layers, + stat_profile=stat_profile, + range_entry=range_entry, + save_dir=save_dir, + alpha_acc=self.search_config["search_estimator"]["alpha_accuracy"], + alpha_mem_density=self.search_config["search_estimator"]["alpha_memory_density"], + alpha_fps=self.search_config["search_estimator"]["alpha_fps"], + alpha_fps_per_lut=self.search_config["search_estimator"]["alpha_fps_per_lut"], + compare_to=self.search_config["search_estimator"]["compare_to"], + ) + # fmt: on + + result_df.drop("quant_config", axis=1).to_csv(result_table_path, index=False) + joblib.dump(study, study_path) + logger.info("========== Best Trials ==========") + logger.info( + f"(alpha_accuracy, alpha_memory_density, alpha_fps) = " + f"{self.search_config['search_estimator']['alpha_accuracy']}, " + f"{self.search_config['search_estimator']['alpha_memory_density']}, " + f"{self.search_config['search_estimator']['alpha_fps']}, " + f"{self.search_config['search_estimator']['alpha_fps_per_lut']}, " + ) + + result_df = result_df.drop("quant_config", axis=1) + result_df["quant_config_name"] = result_df["quant_config_path"].apply( + lambda x: "$save_dir/quant_configs/" + str(Path(x).name) + ) + result_df 
= result_df.applymap( + lambda x: f"{x:.4f}" if isinstance(x, float) else x + ) + result_df = result_df.drop("quant_config_path", axis=1) + logger.info( + "\n" + + tabulate( + result_df, + headers="keys", + tablefmt="pretty", + ) + ) + logger.info(f"Results saved to {save_dir}") + logger.info(f"Study saved to {study_path}") + + def evaluate_best_trials( + self, + study: optuna.Study, + stat_profile: dict, + range_entry: str, + eval_dataloader, + task, + is_regression, + ): + # fmt: off + acc_threshold = self.search_config["search_strategy"]["accuracy_threshold"] + avg_bitwidth_threshold = self.search_config["search_strategy"]["avg_bitwidth_threshold"] + fps_threshold = self.search_config["search_strategy"]["fps_threshold"] + fps_per_lut_threshold = self.search_config["search_strategy"]["fps_per_lut_threshold"] + # fmt: on + sort_by = self.search_config["search_strategy"]["sort_by"] + + for i, s in enumerate(sort_by): + assert s in [ + "accuracy", + "avg_bitwidth", + "fps", + "fps_per_lut", + ], f"Unknown sort_by: {s}, must be one of ['accuracy', 'avg_bitwidth', 'fps', 'fps_per_lut']" + + # fmt: off + result_df = SearchIntQuantisationForClassification.get_result_df( + study, + q_config_parser=self.q_config_parser, + q_config_formatter=self.q_config_formatter, + num_hidden_layers=self.model_config.num_hidden_layers, + stat_profile=stat_profile, + range_entry=range_entry, + save_dir=self.save_dir, + alpha_acc=self.search_config["search_estimator"]["alpha_accuracy"], + alpha_mem_density=self.search_config["search_estimator"]["alpha_memory_density"], + alpha_fps=self.search_config["search_estimator"]["alpha_fps"], + alpha_fps_per_lut=self.search_config["search_estimator"]["alpha_fps_per_lut"], + compare_to=self.search_config["search_estimator"]["compare_to"], + ) + # fmt: on + + filtered_df = result_df.loc[result_df["accuracy"] >= acc_threshold] + filtered_df = filtered_df.loc[ + filtered_df["avg_bitwidth"] <= avg_bitwidth_threshold + ] + filtered_df = filtered_df.loc[filtered_df["fps"] >= fps_threshold] + filtered_df = filtered_df.loc[ + filtered_df["fps_per_lut"] >= fps_per_lut_threshold + ] + if len(filtered_df) == 0: + logger.warning( + f"No trials found with acc >= {acc_threshold}, avg_bitwidth <= {avg_bitwidth_threshold}, fps >= {fps_threshold}, fps_per_lut >= {fps_per_lut_threshold}" + ) + return + + ascending_mapping = { + "accuracy": False, + "avg_bitwidth": True, + "fps": False, + "fps_per_lut": False, + } + + filtered_df = filtered_df.sort_values( + sort_by, ascending=[ascending_mapping[s] for s in sort_by] + ) + + best_quant_config = filtered_df.iloc[0]["quant_config"] + save_config(best_quant_config, self.save_dir / "best_quant_config.toml") + + logger.info("========== Evaluating the Best ==========") + model = self.rebuild_model(best_quant_config) + results = evaluate_cls_task( + model, + task, + eval_dataloader, + is_regression=is_regression, + num_samples=None, + progress_bar=True, + ) + with open(self.save_dir / "best_eval.json", "w") as f: + json.dump(results, f, indent=4) + + logger.info( + f"Best quant config avg bitwidth: {filtered_df.iloc[0]['avg_bitwidth']: .2f}" + ) + logger.info( + f"Best quant config software metric: {pformat(results)}, saved to {self.save_dir / 'best_eval.json'})" + ) + + +class SearchIntQuantisationForPromptingCLS(SearchBase): + """ + Perform quantisation search for GPT-like models on downstream tasks + - GPT-like model refers to a network performing language modeling only (no classifier head). 
+ - This class calls lm-eval-harness to get the results on downstream tasks. + """ + + def __init__( + self, + model_arch: str, + model_name: str, + search_config: dict | str, + save_dir: str, + ) -> None: + super().__init__( + model_arch, + model_name, + "lm", + search_config, + save_dir, + device=None, + model_parallel=False, + ) + self.q_bitwidth_profiler = get_model_profiler(model_arch) + self.q_config_parser = get_quant_config_parser(model_arch) + self.q_config_sampler = get_quant_config_sampler(model_arch) + self.q_config_formatter = get_stat_config_formatter(model_arch) + + self._pre_search_check() + + def _pre_search_check(self): + if self.search_config["search_estimator"]["alpha_accuracy"] == 0: + assert ( + self.search_config["search_strategy"]["accuracy_threshold"] == 0 + ), "alpha_accuracy is 0, please set accuracy_threshold to 0 as well" + if self.search_config["search_estimator"]["alpha_memory_density"] == 0: + assert ( + self.search_config["search_strategy"]["avg_bitwidth_threshold"] == 0 + ), "alpha_memory_density is 0, please set avg_bitwidth_threshold to 0 as well" + if self.search_config["search_estimator"]["alpha_fps"] == 0: + assert ( + self.search_config["search_strategy"]["fps_threshold"] == 0 + ), "alpha_fps is 0, please set fps_threshold to 0 as well" + if self.search_config["search_estimator"]["fps_per_lut"] == 0: + assert ( + self.search_config["search_strategy"]["fps_threshold"] == 0 + ), "fps_per_lut is 0, please set fps_threshold to 0 as well" + + def rebuild_model(self, quant_config): + raise NotImplementedError + + def rebuild_model_config(self, quant_config): + self.model_config = self.config_cls.from_pretrained( + self.model_name, quant_config=quant_config + ) + return self.model_config + + def search( + self, + tasks: list[str], + num_fewshot: int, + batch_size: int, + max_batch_size: int, + device: str, + num_samples_per_trial: int, + stat_profile: dict, + range_entry: str = "range_min_max", + profiler_seq_len: int = 256, + ): + def compute_software_metric( + model_arch, + model_name, + quant_config, + tasks, + num_fewshot, + batch_size, + max_batch_size, + device, + limit, + ) -> dict: + results = eval_prompting_tasks( + model_wrapper="llm-mixed-q", + model_arch=model_arch, + model_name=model_name, + quant_config=quant_config, + tasks=tasks, + num_fewshot=num_fewshot, + batch_size=batch_size, + max_batch_size=max_batch_size, + device=device, + limit=limit, + no_cache=True, + ) + results = results[ + "results" + ] # {"results": ..., "config": ..., version: ...} + if len(results) > 1: + logger.debug("software_metric_results: " + str(results)) + logger.warning( + f"More than one task results returned, simply averaging the accuracy if available" + ) + logger.debug("software_metric_results: " + str(results)) + acc_list = [] + for task_name, task_metric in results.items(): + if "acc" in task_metric: + acc_list.append(task_metric["acc"]) + else: + logger.warning(f"Task {task_name} does not have accuracy, skipping") + avg_acc = sum(acc_list) / len(acc_list) + s_metric = { + "accuracy": avg_acc, + } + + return s_metric + + def compute_hardware_metric(profiler, config, seq_len, compare_to=32): + results = profiler(config, seq_len) + num_params = results["num_params"] + num_acts = results["num_acts"] + param_bits = results["param_bits"] + act_bits = results["act_bits"] + flops = results["flops"] + + param_bits_fp32 = compare_to * num_params + act_bits_fp32 = compare_to * num_acts + + mem_density = (param_bits_fp32 + act_bits_fp32) / (param_bits + act_bits) + 
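# Worked example with hypothetical numbers: against compare_to=32, a model whose weights
# and activations are all held in 4 bits gives mem_density = 32 / 4 = 8, so the reported
# avg_bitwidth = compare_to / mem_density = 4.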
h_metric = { + "memory_density": mem_density, + } + + results = eval_dse_results(config, is_mixed=True) + h_metric.update( + { + "fps": results["best_fps"], + "fps_per_lut": results["best_fps"] / results["resource"], + } + ) + return h_metric + + def objective( + trial: optuna.Trial, + quant_config_seed, + seq_len: int, + tasks: list[str], + num_fewshot: int, + batch_size: int, + max_batch_size: int, + device: str, + limit: int, + stat_profile: dict, + range_entry: str, + file_logger: logging.Logger, + ): + if self.search_config["search_space"]["extend_quant_config_seed_first"]: + quant_config_seed = self.q_config_parser( + quant_config_seed, self.model_config.num_hidden_layers, strict=False + ) + + sampled_config = self.q_config_sampler( + trial=trial, + name="root", + config_seed=quant_config_seed, + ) + + sampled_config = self.q_config_parser( + sampled_config, self.model_config.num_hidden_layers, strict=False + ) + + flattened_sampled_config = {} + flatten_dict(sampled_config, new_d=flattened_sampled_config) + + sampled_config_complete = sampled_config + sampled_config = transform_stat_profile_to_int_quant_config( + stat_profile, + range_entry=range_entry, + width=flattened_sampled_config, + frac_choices=None, + root_name="root", + is_ptq=True, + bypass=False, + ) + + self.q_config_formatter( + sampled_config, + self.model_config.num_hidden_layers, + default_config=sampled_config_complete, + is_ptq=True, + bypass=False, + ) + s_metric = compute_software_metric( + model_arch=self.model_arch, + model_name=self.model_name, + quant_config=sampled_config, + tasks=tasks, + num_fewshot=num_fewshot, + batch_size=batch_size, + max_batch_size=max_batch_size, + device=device, + limit=limit, + ) + config = self.rebuild_model_config(sampled_config) + h_metric = compute_hardware_metric( + self.q_bitwidth_profiler, + config, + seq_len=seq_len, + compare_to=self.search_config["search_estimator"]["compare_to"], + ) + + metric_name_list = list(s_metric.keys()) + list(h_metric.keys()) + scaled_metric_list = [] + metric_list = list(s_metric.values()) + list(h_metric.values()) + + # accuracy + for metric_name, metric in s_metric.items(): + scaled_metric_list.append( + metric + * self.search_config["search_estimator"][f"alpha_{metric_name}"] + ) + # memory density, fps + for metric_name, metric in h_metric.items(): + scaled_metric_list.append( + metric + * self.search_config["search_estimator"][f"alpha_{metric_name}"] + ) + + if trial.number == 0: + file_logger.info( + f"trial_id," + + ",".join(metric_name_list) + + "," + + ",".join(map(lambda x: f"scaled_{x}", metric_name_list)) + ) + + file_logger.info( + f"{trial.number}," + + ",".join(map(str, metric_list)) + + "," + + ",".join(map(str, scaled_metric_list)) + ) + return (*scaled_metric_list,) + + def logger_callback( + study: optuna.Study, frozen_trail: optuna.trial.FrozenTrial + ): + acc, mem_density, fps, fps_per_lut = frozen_trail.values + # fmt: off + ori_acc = acc / (self.search_config["search_estimator"]["alpha_accuracy"] + 1e-8) + ori_mem_density = mem_density / (self.search_config["search_estimator"]["alpha_memory_density"] + 1e-8) + ori_fps = fps / (self.search_config["search_estimator"]["alpha_fps"] + 1e-8) + ori_fps_per_lut = fps_per_lut / (self.search_config["search_estimator"]["alpha_fps"] + 1e-8) + + avg_bitwidth = self.search_config["search_estimator"]["compare_to"] / ori_mem_density + # fmt: on + logger.info( + f"Trial {frozen_trail.number} is done: " + f"unscaled (accuracy, mem_density, fps, fps_per_lut) = " + f"({ori_acc:.4f}, 
{ori_mem_density:.2f}, {ori_fps:.2f}, {fps_per_lut:.2e}), " + f"scaled (...) = " + f"({acc:.4f}, {mem_density:.2f}, {fps:.2f}, {fps_per_lut:.2f}), " + f"avg_bitwidth = {avg_bitwidth:.1f}" + ) + + # create sampler and study + match self.search_config["search_strategy"]["sampler"].lower(): + case "random": + sampler = optuna.samplers.RandomSampler() + case "tpe": + sampler = optuna.samplers.TPESampler() + case "nsgaii": + sampler = optuna.samplers.NSGAIISampler() + case "nsgaiii": + sampler = optuna.samplers.NSGAIIISampler() + case "qmc": + sampler = optuna.samplers.QMCSampler() + case _: + raise ValueError( + f"Unknown sampler name: {self.search_config['search_strategy']['sampler']}" + ) + logger.info(f"Using sampler: {sampler.__class__.__name__}") + study = optuna.create_study( + directions=["maximize", "maximize", "maximize", "maximize"], + sampler=sampler, + ) + + # sample configs + q_config_seed = self.search_config["search_space"]["quant_config_seed"] + + study.optimize( + func=partial( + objective, + quant_config_seed=q_config_seed, + seq_len=profiler_seq_len, + tasks=tasks, + num_fewshot=num_fewshot, + batch_size=batch_size, + max_batch_size=max_batch_size, + device=device, + limit=num_samples_per_trial, + stat_profile=stat_profile, + range_entry=range_entry, + file_logger=self.logger, + ), + n_trials=self.search_config["search_strategy"]["n_trials"], + n_jobs=self.search_config["search_strategy"]["n_jobs"], + timeout=self.search_config["search_strategy"].get("timeout", None), + show_progress_bar=True, + callbacks=[logger_callback], + ) + + self.save_study_and_results(study, stat_profile, range_entry) + return study + + @staticmethod + def save_trial_to_quant_config( + trial: optuna.trial.FrozenTrial, + q_config_parser: callable, + q_config_formatter: callable, + num_hidden_layers: int, + stat_profile: dict, + range_entry: str, + save_path: str = None, + ): + def parse_and_create_item(quant_config: dict, keys: list[str], value): + for i, key in enumerate(keys): + if key not in quant_config: + quant_config[key] = {} + if i == len(keys) - 1: + quant_config[key] = value + else: + quant_config = quant_config[key] + + params = trial.params + + sampled_config = {} + for name, value in params.items(): + keys = name.removeprefix("root:").split(":") + if isinstance(value, str) and value.startswith("!ast!"): + value = ast.literal_eval(value.removeprefix("!ast!")) + parse_and_create_item(sampled_config, keys, value) + # here we got sampled_config = self.q_config_sampler in self.search + + sampled_config = q_config_parser( + sampled_config, num_hidden_layers, strict=False + ) + flattened_sampled_config = {} + flatten_dict(sampled_config, new_d=flattened_sampled_config) + + sampled_config_complete = sampled_config + sampled_config = transform_stat_profile_to_int_quant_config( + stat_profile, + range_entry=range_entry, + width=flattened_sampled_config, + frac_choices=None, + root_name="root", + is_ptq=True, + bypass=False, + ) + q_config_formatter( + sampled_config, + num_hidden_layers, + default_config=sampled_config_complete, + is_ptq=True, + bypass=False, + ) + + if save_path is not None: + save_config(sampled_config, save_path) + return sampled_config + + @staticmethod + def get_result_df( + study: optuna.Study, + q_config_parser: callable, + q_config_formatter: callable, + num_hidden_layers: int, + stat_profile: dict, + range_entry: str, + save_dir, + alpha_acc: float, + alpha_mem_density: float, + alpha_fps: float, + alpha_fps_per_lut: float, + compare_to: int, + ) -> pd.DataFrame: + 
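# One row is built per Pareto-optimal trial in study.best_trials: the unscaled metrics are
# recovered by dividing each scaled objective by its alpha weight (+1e-8 guards against
# division by zero), avg_bitwidth = compare_to / memory_density, and every trial's quant
# config is written to save_dir / "quant_configs" / "quant_config_<i>.toml".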
result_df = pd.DataFrame( + columns=[ + "trial_id", + "accuracy", + "memory_density", + "fps", + "fps_per_lut", + "scaled_accuracy", + "scaled_memory_density", + "scaled_fps", + "scaled_fps_per_lut", + "quant_config_path", + "avg_bitwidth", + "quant_config", + "datetime_start", + "datetime_end", + ] + ) + quant_config_dir = save_dir / "quant_configs" + quant_config_dir.mkdir(parents=True, exist_ok=True) + for i, trial in enumerate(study.best_trials): + trial_id = trial.number + quant_config_path = quant_config_dir / f"quant_config_{i}.toml" + quant_config = ( + SearchIntQuantisationForPromptingCLS.save_trial_to_quant_config( + trial, + q_config_parser=q_config_parser, + q_config_formatter=q_config_formatter, + num_hidden_layers=num_hidden_layers, + stat_profile=stat_profile, + range_entry=range_entry, + save_path=quant_config_path, + ) + ) + ( + scaled_acc, + scaled_mem_density, + scaled_fps, + scaled_fps_per_lut, + ) = trial.values + acc = scaled_acc / (alpha_acc + 1e-8) + mem_density = scaled_mem_density / (alpha_mem_density + 1e-8) + fps = scaled_fps / (alpha_fps + 1e-8) + fps_per_lut = scaled_fps_per_lut / (alpha_fps_per_lut + 1e-8) + avg_bitwidth = compare_to / mem_density + result_df.loc[i] = [ + trial_id, + acc, + mem_density, + fps, + fps_per_lut, + scaled_acc, + scaled_mem_density, + scaled_fps, + scaled_fps_per_lut, + quant_config_path, + avg_bitwidth, + quant_config, + trial.datetime_start.strftime("%Y-%m-%d %H:%M:%S"), + trial.datetime_complete.strftime("%Y-%m-%d %H:%M:%S"), + ] + result_df = result_df.sort_values(by="accuracy", ascending=False) + return result_df + + def save_study_and_results(self, study: optuna.Study, stat_profile, range_entry): + save_dir = Path(self.save_dir) + save_dir.mkdir(parents=True, exist_ok=True) + study_path = save_dir / "study.pkl" + result_table_path = save_dir / "results.csv" + search_config_path = save_dir / "search_config.toml" + save_config(self.search_config, search_config_path) + + # fmt: off + result_df = SearchIntQuantisationForPromptingCLS.get_result_df( + study, + q_config_parser=self.q_config_parser, + q_config_formatter=self.q_config_formatter, + num_hidden_layers=self.model_config.num_hidden_layers, + stat_profile=stat_profile, + range_entry=range_entry, + save_dir=save_dir, + alpha_acc=self.search_config["search_estimator"]["alpha_accuracy"], + alpha_mem_density=self.search_config["search_estimator"]["alpha_memory_density"], + alpha_fps=self.search_config["search_estimator"]["alpha_fps"], + alpha_fps_per_lut=self.search_config["search_estimator"]["alpha_fps_per_lut"], + compare_to=self.search_config["search_estimator"]["compare_to"], + ) + # fmt: on + result_df.drop("quant_config", axis=1).to_csv(result_table_path, index=False) + joblib.dump(study, study_path) + logger.info("========== Best Trials ==========") + logger.info( + f"(alpha_accuracy, alpha_memory_density, alpha_fps) = " + f"{self.search_config['search_estimator']['alpha_accuracy']}, " + f"{self.search_config['search_estimator']['alpha_memory_density']}, " + f"{self.search_config['search_estimator']['alpha_fps']}, " + f"{self.search_config['search_estimator']['alpha_fps_per_lut']}," + ) + + result_df = result_df.drop("quant_config", axis=1) + result_df["quant_config_name"] = result_df["quant_config_path"].apply( + lambda x: "$save_dir/quant_configs/" + str(Path(x).name) + ) + result_df = result_df.applymap( + lambda x: f"{x:.4f}" if isinstance(x, float) else x + ) + result_df = result_df.drop("quant_config_path", axis=1) + logger.info( + "\n" + + tabulate( + result_df, + 
headers="keys", + tablefmt="pretty", + ) + ) + logger.info(f"Results saved to {save_dir}") + logger.info(f"Study saved to {study_path}") + + def evaluate_best_trials( + self, + study: optuna.Study, + tasks: list[str], + num_fewshot: int, + batch_size: int, + max_batch_size: int, + device: str, + stat_profile: dict, + range_entry: str, + ): + # fmt: off + acc_threshold = self.search_config["search_strategy"]["accuracy_threshold"] + avg_bitwidth_threshold = self.search_config["search_strategy"]["avg_bitwidth_threshold"] + fps_threshold = self.search_config["search_strategy"]["fps_threshold"] + fps_per_lut_threshold = self.search_config["search_strategy"]["fps_per_lut_threshold"] + # fmt: on + sort_by = self.search_config["search_strategy"]["sort_by"] + + for i, s in enumerate(sort_by): + assert s in [ + "accuracy", + "avg_bitwidth", + "fps", + "fps_per_lut", + ], f"Unknown sort_by: {s}, must be one of ['accuracy', 'avg_bitwidth', 'fps', 'fps_per_lut']" + # fmt: off + result_df = SearchIntQuantisationForPromptingCLS.get_result_df( + study, + q_config_parser=self.q_config_parser, + q_config_formatter=self.q_config_formatter, + num_hidden_layers=self.model_config.num_hidden_layers, + stat_profile=stat_profile, + range_entry=range_entry, + save_dir=self.save_dir, + alpha_acc=self.search_config["search_estimator"]["alpha_accuracy"], + alpha_mem_density=self.search_config["search_estimator"]["alpha_memory_density"], + alpha_fps=self.search_config["search_estimator"]["alpha_fps"], + alpha_fps_per_lut=self.search_config["search_estimator"]["alpha_fps_per_lut"], + compare_to=self.search_config["search_estimator"]["compare_to"], + ) + # fmt: on + + filtered_df = result_df.loc[result_df["accuracy"] >= acc_threshold] + filtered_df = filtered_df.loc[ + filtered_df["avg_bitwidth"] <= avg_bitwidth_threshold + ] + filtered_df = filtered_df.loc[filtered_df["fps"] >= fps_threshold] + filtered_df = filtered_df.loc[ + filtered_df["fps_per_lut"] >= fps_per_lut_threshold + ] + if len(filtered_df) == 0: + logger.warning( + f"No trials found with acc >= {acc_threshold}, avg_bitwidth <= {avg_bitwidth_threshold}, fps >= {fps_threshold}, fps_per_lut >= {fps_per_lut_threshold}" + ) + return + + ascending_mapping = { + "accuracy": False, + "avg_bitwidth": True, + "fps": False, + "fps_per_lut": False, + } + + filtered_df = filtered_df.sort_values( + sort_by, ascending=[ascending_mapping[s] for s in sort_by] + ) + + best_quant_config = filtered_df.iloc[0]["quant_config"] + save_config(best_quant_config, self.save_dir / "best_quant_config.toml") + + logger.info("========== Evaluating the Best ==========") + results = eval_prompting_tasks( + model_wrapper="llm-mixed-q", + model_arch=self.model_arch, + model_name=self.model_name, + quant_config=best_quant_config, + tasks=tasks, + num_fewshot=num_fewshot, + batch_size=batch_size, + max_batch_size=max_batch_size, + device=device, + no_cache=True, + ) + results["trial_id"] = str(filtered_df.iloc[0]["trial_id"]) + with open(self.save_dir / "best_eval.json", "w") as f: + json.dump(results, f, indent=4) + + logger.info( + f"Best quant config avg bitwidth: {filtered_df.iloc[0]['avg_bitwidth']: .2f}" + ) + logger.info( + f"Best quant config software metric: {pformat(results)}, saved to {self.save_dir / 'best_eval.json'})" + ) diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/statstic_profiler/__init__.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/statstic_profiler/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..3c3b9fa7654fdede5dd44583c25b41fe39280042 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/statstic_profiler/__init__.py @@ -0,0 +1,3 @@ +from .stat_manager import StatManager +from .stat_profiler import (profile_statistics_cls_glue, + profile_statistics_lm_fn) diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/statstic_profiler/stat_manager.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/statstic_profiler/stat_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..0d5be1101c7c1908849baff3ddc8702322918938 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/statstic_profiler/stat_manager.py @@ -0,0 +1,154 @@ +import torch +from tqdm import tqdm + +from .stats import _StatBase, create_new_stat + + +class ActStatCollection: + def __init__(self, stats: list[str] | dict[str, dict]): + self.stats: list[_StatBase] = [] + if isinstance(stats, (list, tuple)): + for stat_name in stats: + self.stats.append(create_new_stat(stat_name)) + elif isinstance(stats, dict): + for stat_name, stat_kwargs in stats.items(): + self.stats.append(create_new_stat(stat_name, **stat_kwargs)) + else: + raise ValueError(f"Unknown type of stats: {type(stats)}") + + def update(self, batch: torch.Tensor): + assert isinstance( + batch, torch.Tensor + ), f"batch must be a Tensor, got {type(batch)}" + for stat in self.stats: + if hasattr(stat, "update_a_batch"): + stat.update_a_batch(batch) + else: + for i in range(batch.size(0)): + stat.update_a_sample(batch[[i], ...]) + + def compute(self) -> dict: + results = {} + for stat in self.stats: + results.update(stat.export()) + return results + + def __repr__(self) -> str: + return "ActStatCollection(stats={})".format( + ", ".join([type(stat).__name__ for stat in self.stats]) + ) + + +class WeightStatCollection: + def __init__(self, stats: list[str] | dict[str, dict]) -> None: + self.stats: list[_StatBase] = [] + if isinstance(stats, dict): + for stat_name, stat_config in stats.items(): + self.stats.append(create_new_stat(stat_name, **stat_config)) + elif isinstance(stats, (list, tuple)): + for stat_name in stats: + self.stats.append(create_new_stat(stat_name)) + else: + raise ValueError(f"Unknown type of stats: {type(stats)}") + + def update(self, weight: torch.Tensor): + assert isinstance(weight, torch.Tensor) + for stat in self.stats: + stat.update_a_sample(weight) + + def compute(self) -> dict[str, dict[str, list]]: + results = {} + for stat in self.stats: + results.update(stat.export()) + + return results + + def __repr__(self) -> str: + return "WeightStatCollection(stats={})".format( + ", ".join([type(stat).__name__ for stat in self.stats]) + ) + + +class StatManager: + def __init__( + self, + act_stats: tuple[str] | dict[str, dict], + weight_stats: tuple[str] | dict[str, dict], + ) -> None: + self.act_stats = act_stats + self.weight_stats = weight_stats + + self.registered_stats = {} + self.weight_collect_updated = {} + + def get_pre_forward_act_hook(self, name: str) -> callable: + assert ( + name not in self.registered_stats + ), f"The name `{name}` has been registered for a collection of input activations" + new_act_clc = ActStatCollection(self.act_stats) + self.registered_stats[name] = new_act_clc + + def hook(module: torch.nn.Module, input: tuple) -> None: + new_act_clc.update(input[0]) + return None + + return hook + + def get_post_forward_act_hook(self, name: str) -> callable: + assert ( + name not in self.registered_stats + ), f"The name `{name}` has been 
registered for a collection of output activations" + new_act_clc = ActStatCollection(self.act_stats) + self.registered_stats[name] = new_act_clc + + def hook(module: torch.nn.Module, input: tuple, output: tuple) -> None: + new_act_clc.update(output[0]) + return None + + return hook + + def get_pre_forward_weight_hook(self, name: str, weight_name: str) -> callable: + assert ( + name not in self.registered_stats + ), f"The name `{name}` has been registered for a collection of weights" + + new_weight_clc = WeightStatCollection(self.weight_stats) + self.registered_stats[name] = new_weight_clc + self.weight_collect_updated[name] = False + + def hook(module: torch.nn.Module, input: tuple) -> None: + weight = getattr(module, weight_name) + if self.weight_collect_updated[name]: + pass + else: + new_weight_clc.update(weight) + self.weight_collect_updated[name] = True + return None + + return hook + + def finalize(self, show_progress_bar: bool = False) -> dict[str, dict[str, dict]]: + """ + { + : { + : {...} + } + } + + is the name of the registered stat collection. + """ + if show_progress_bar: + stat_iter = tqdm( + self.registered_stats.items(), + desc="Finalizing stats", + total=len(self.registered_stats), + ) + else: + stat_iter = self.registered_stats.items() + results = {} + for name, stat in stat_iter: + delta = {name: stat.compute()} + results.update(delta) + if show_progress_bar: + stat_iter.update(1) + return results diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/statstic_profiler/stat_profiler.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/statstic_profiler/stat_profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..b65fc930adcb52c609f64f689edfa569cf3a7208 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/statstic_profiler/stat_profiler.py @@ -0,0 +1,81 @@ +import logging + +from ..eval import eval_cls_glue, eval_lm_wikitext2 +from .stat_manager import StatManager + +logger = logging.getLogger(__name__) + + +def profile_statistics_cls_glue( + act_stats: tuple[str] | dict[str, dict], + weight_stats: tuple[str] | dict[str, dict], + hook_registration_fn: callable, + model, + task: str, + eval_dataloader, + is_regression: bool, + num_samples: int, + root_name: str = "root", + show_progress_bar: bool = True, +): + """ + This function is used to profile the statistics of the activations and weights of the model. + The statistics are collected by the hooks registered by the hook_registration_fn. + + Args: + act_stats (tuple[str]): A tuple of strings, each of which is the name of an activation statistic. + weight_stats (tuple[str]): A tuple of strings, each of which is the name of a weight statistic. + hook_registration_fn (callable): A function that registers hooks to the model. 
+ + ---- + hook_registration_fn should have the following signature: + def hook_registration_fn(stat_manager: StatManager, model, root_name: str, num_hidden_layers: int)->None: + """ + + stat_manager = StatManager(act_stats, weight_stats) + hook_registration_fn( + stat_manager=stat_manager, + model=model, + name=root_name, + num_hidden_layers=model.config.num_hidden_layers, + ) + eval_cls_glue( + model=model, + task=task, + eval_dataloader=eval_dataloader, + is_regression=is_regression, + num_samples=num_samples, + progress_bar=True, + ) + stat_profile = stat_manager.finalize(show_progress_bar=show_progress_bar) + return stat_profile + + +def profile_statistics_lm_fn( + act_stats: tuple[str], + weight_stats: tuple[str], + hook_registration_fn: callable, + model, + eval_dataloader, + num_samples: int, + input_device: str, + root_name: str = "root", + show_progress_bar: bool = True, +): + stat_manager = StatManager(act_stats, weight_stats) + hook_registration_fn( + stat_manager=stat_manager, + model=model, + name=root_name, + num_hidden_layers=model.config.num_hidden_layers, + ) + + eval_lm_wikitext2( + model=model, + eval_dataloader=eval_dataloader, + num_samples=num_samples, + progress_bar=True, + input_device=input_device, + ) + stat_profile = stat_manager.finalize(show_progress_bar=show_progress_bar) + return stat_profile diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/statstic_profiler/stats.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/statstic_profiler/stats.py new file mode 100644 index 0000000000000000000000000000000000000000..41d24dbba587c684a167e7a1d21bce5f9e2d9d95 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/statstic_profiler/stats.py @@ -0,0 +1,420 @@ +import logging +import math +from typing import Literal + +import numpy as np +import torch +from numpy import ndarray +from torch import Tensor + +logger = logging.getLogger(__name__) + +STAT_NAME_TO_CLS = {} + + +def _add_to_stat_mapping(cls): + global STAT_NAME_TO_CLS + assert issubclass(cls, _StatBase) + STAT_NAME_TO_CLS[cls.name] = cls + return cls + + +class _StatBase: + name: str = None + + def __init__(self) -> None: + pass + + @torch.no_grad() + def update_a_sample(self, *args, **kwargs) -> None: + """ + Update the stat with a new sample. + + If the sample is a Tensor, it will be detached and copied to the device specified in the constructor. + If the sample is a list, tuple, int, or float, it will be converted to a Tensor. + """ + raise NotImplementedError + + @torch.no_grad() + def compute(self) -> dict[str, Tensor]: + """ + Compute/finalize the stat and return a dict of results. + + The results should be a dict of Tensors. + """ + raise NotImplementedError + + def export(self) -> dict[str, dict[str, list]]: + """ + Export the stat to a dict of dict of lists. + + This method calls compute() and converts the results to lists, which is more friendly to toml serialization. + """ + results = self.compute() + return { + self.name: { + k: v.tolist() if isinstance(v, (Tensor)) else v + for k, v in results.items() + } + } + + def __repr__(self) -> str: + return type(self).__name__.capitalize() + + +@_add_to_stat_mapping +class Record(_StatBase): + """ + Record all samples passed in + + Args: + device (str|None): the device to move the samples to. If None, the samples will not be moved. + add_new_dim_before_concat (bool): if True, add a new dimension before concatenating the samples. 
+ """ + + name = "record" + + def __init__(self, device=None, add_new_dim_before_concat: bool = False) -> None: + super().__init__() + self.device = device + self.add_new_dim: bool = add_new_dim_before_concat + self.data: Tensor = None + self.count: int = None + self.total_size_in_bytes: float = None + + @torch.no_grad() + def update_a_sample(self, new_s: Tensor): + if isinstance(new_s, (list, tuple, int, float)): + new_s = torch.tensor(new_s).float() + assert isinstance(new_s, Tensor) + new_s = new_s.clone().detach().float() + if self.device is not None: + new_s = new_s.to(self.device) + if self.add_new_dim: + new_s = new_s.unsqueeze(0) + if self.data is None: + self.data = new_s + self.count = 1 + else: + self.data = torch.concat((self.data, new_s), dim=0) + self.count += 1 + self.total_size_in_bytes = self.data.element_size() * self.data.nelement() + + @torch.no_grad() + def compute(self) -> dict: + return { + "data": self.data, + "count": self.count, + "size_in_bytes": self.total_size_in_bytes, + } + + +@_add_to_stat_mapping +class VarianceOnline(_StatBase): + """ + Use Welford's online algorithm to calculate running variance and mean + + This saves memory by not storing all the samples, but the variance is not precise when the count is small. + + https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm + + --- + Args: + device (str|None): the device to move the samples to. If None, the samples will not be moved. + dims (str|list|None): the dimensions to reduce. If "all", reduce all dimensions. If None, do not reduce any dimension. If a list, reduce the specified dimensions. + """ + + name = "variance_online" + + def __init__(self, device=None, dims: Literal["all"] | None | list = "all") -> None: + super().__init__() + self.device = device + if isinstance(dims, (list, tuple)): + # assert sorted(dims) == list( + # range(min(dims), max(dims) + 1) + # ), "dims must be consecutive" + self.dims_to_reduce = sorted(dims) + else: + assert dims in ["all", None] + self.dims_to_reduce = dims + + self.count: int = 0 + self.mean: Tensor = 0 + self.m: Tensor = 0 + + @staticmethod + def _update( + new_s: Tensor, + count: int, + mean: Tensor, + m: Tensor, + ): + count += 1 + delta = new_s - mean + mean += delta / count + m += delta * (new_s - mean) + + return count, mean, m + + @staticmethod + def _reshape_a_sample(new_s: Tensor, dims_to_reduce: list[int]): + dims_to_keep = [i for i in range(new_s.ndim) if i not in dims_to_reduce] + transpose_dims = dims_to_keep + dims_to_reduce + new_s = new_s.permute(*transpose_dims) + new_s = torch.flatten(new_s, start_dim=len(dims_to_keep), end_dim=-1) + return new_s + + @torch.no_grad() + def update_a_sample(self, new_s: Tensor): + if isinstance(new_s, (list, tuple, int, float)): + new_s = torch.tensor(new_s) + assert isinstance(new_s, Tensor) + new_s = new_s.clone().detach().float() + + if self.device is not None: + new_s = new_s.to(self.device) + + match self.dims_to_reduce: + case "all": + new_s = torch.flatten(new_s) + n_b = new_s.nelement() + mean_b = new_s.mean() + + delta = mean_b - self.mean + self.mean += delta * n_b / (self.count + n_b) + self.m += new_s.var() * n_b + delta**2 * self.count * n_b / ( + self.count + n_b + ) + self.count += n_b + case None: + self.count, self.mean, self.m = self._update( + new_s=new_s, + count=self.count, + mean=self.mean, + m=self.m, + ) + case _: + # self.dims_to_reduce is a list + new_s = self._reshape_a_sample( + new_s, dims_to_reduce=self.dims_to_reduce + ) + for i in 
range(new_s.size(-1)): + self.count, self.mean, self.m = self._update( + new_s=new_s[..., i], + count=self.count, + mean=self.mean, + m=self.m, + ) + + @torch.no_grad() + def compute(self) -> dict: + if self.count < 2: + logger.warning( + f"VarianceOnline: count is {self.count}, which is less than 2. " + "Returning NA for mean and variance." + ) + return {"mean": "NA", "variance": "NA"} + + var = self.m / self.count + return { + "mean": self.mean, + "variance": var, + "count": self.count, + } + + +@_add_to_stat_mapping +class VariancePrecise(Record): + """ + Concatenate samples and use torch.var, torch.mean to calculate variance and mean + + This is precise but may use massive memory when the count or sample size is large. + + --- + Args: + device (str|None): the device to move the samples to. If None, the samples will not be moved. + dims (str|list|None): the dimensions to reduce. If "all", reduce all dimensions. If None, do not reduce any dimension. If a list, reduce the specified dimensions. + """ + + name = "variance_precise" + + def __init__(self, device=None, dims: Literal["all"] | None | list = "all") -> None: + super().__init__(device=device, add_new_dim_before_concat=True) + self.dims_to_reduce = dims + + @torch.no_grad() + def update_a_sample(self, new_s: Tensor): + super().update_a_sample(new_s=new_s) + + @torch.no_grad() + def compute(self) -> dict[str, ndarray]: + match self.dims_to_reduce: + case "all": + var = torch.var(self.data) + mean = torch.mean(self.data) + count = self.data.nelement() + case None: + count = self.data.size(0) + if self.data.size(0) < 2: + logger.warning( + f"VariancePrecise: count is {self.data.size(0)}, which is less than 2. " + "Returning NA for mean and variance." + ) + mean = "NA" + var = "NA" + else: + var = torch.var(self.data, dim=0) + mean = torch.mean(self.data, dim=0) + case _: + dims_to_reduce = [i + 1 for i in self.dims_to_reduce] + count = self.data[dims_to_reduce].nelements() + if self.data[dims_to_reduce].nelements() < 2: + logger.warning( + f"VariancePrecise: count is {self.data[dims_to_reduce].nelements()}, which is less than 2. " + "Returning NA for mean and variance." + ) + var = "NA" + mean = "NA" + else: + var = torch.var(self.data, dim=[0] + dims_to_reduce) + mean = torch.mean(self.data, dim=[0] + dims_to_reduce) + return {"mean": mean, "variance": var, "count": count} + + +@_add_to_stat_mapping +class RangeMinMax(_StatBase): + """ + Calculate the range of samples based on the min and max values. + + --- + Args: + device (str|None): the device to move the samples to. If None, the samples will not be moved. + dims (str|list|None): the dimensions to reduce. If "all", reduce all dimensions. If None, do not reduce any dimension. If a list, reduce the specified dimensions. + abs (bool): if True, take the absolute value of the samples before calculating the min and max. 
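+
+    Note: compute() returns "NA" for min, max, and range until at least two elements
+    (or samples, depending on `dims`) have been observed.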
+ """ + + name = "range_min_max" + + def __init__( + self, device=None, dims: Literal["all"] | list | None = "all", abs: bool = False + ) -> None: + super().__init__() + self.device = device + self.dims = dims + self.abs = abs + self.min = None + self.max = None + self.count = 0 + + @torch.no_grad() + def update_a_sample(self, new_s: Tensor): + if isinstance(new_s, (list, tuple, int, float)): + new_s = torch.tensor(new_s).float() + new_s = new_s.clone().detach().float() + if self.device: + new_s = new_s.to(self.device) + + if self.abs: + new_s = torch.abs(new_s) + + if self.min is None: + match self.dims: + case None: + self.min = new_s + self.max = new_s + self.count += 1 + case "all": + self.min = torch.min(new_s) + self.max = torch.max(new_s) + self.count += new_s.nelement() + case _: + n_elem = 1 + for dim in self.dims: + n_elem *= new_s.size(dim) + self.min = torch.min(new_s, dim=dim) + self.max = torch.max(new_s, dim=dim) + self.count += n_elem + else: + match self.dims: + case None: + self.min = torch.min(self.min, new_s) + self.max = torch.max(self.max, new_s) + self.count += 1 + case "all": + self.min = torch.min(self.min, torch.min(new_s)) + self.max = torch.max(self.max, torch.max(new_s)) + self.count += new_s.nelement() + case _: + n_elem = 1 + for dim in self.dims: + n_elem *= new_s.size(dim) + self.min = torch.min(self.min, torch.min(new_s, dim=dim)) + self.max = torch.max(self.max, torch.max(new_s, dim=dim)) + self.count += n_elem + + def compute(self) -> dict: + if self.count < 2: + logger.warning( + f"RangeMinMax: count is {self.count}, which is less than 2. " + "Returning NA for min and max." + ) + minimum = "NA" + maximum = "NA" + d_range = "NA" + else: + minimum = self.min + maximum = self.max + d_range = self.max - self.min + return {"min": minimum, "max": maximum, "range": d_range, "count": self.count} + + +@_add_to_stat_mapping +class ThresholdCount(_StatBase): + name = "threshold_count" + + def __init__( + self, device=None, threshold: float = 6.0, dims: list[int] | tuple[int] = None + ) -> None: + super().__init__() + self.device = device + self.threshold = threshold + self.dims = dims + self.n_outliers = 0 + self.total = 0 + self.n_samples = 0 + + @torch.no_grad() + def update_a_sample(self, new_s: Tensor) -> None: + assert isinstance(new_s, Tensor) + if self.device is None: + self.device = new_s.device + new_s = new_s.clone().detach().float().to(self.device) + + comp = torch.abs(new_s) > self.threshold + if self.dims is not None: + self.n_outliers += torch.sum(comp, dim=self.dims) + self.total += np.prod([new_s.size(dim) for dim in self.dims]) + self.n_samples += 1 + else: + self.n_outliers += torch.sum(comp) + self.total += new_s.nelement() + self.n_samples += 1 + + def compute(self) -> dict: + return { + "num_outliers": self.n_outliers.cpu().tolist() + if isinstance(self.n_outliers, Tensor) + else self.n_outliers, + "total": self.total, + "threshold": self.threshold, + "num_samples": self.n_samples, + } + + +def create_new_stat(stat_name: str, **stat_kwargs): + global STAT_NAME_TO_CLS + assert ( + stat_name in STAT_NAME_TO_CLS + ), f"Unknown stat name: {stat_name}. 
Available stat names: {list(STAT_NAME_TO_CLS.keys())}" + stat_cls = STAT_NAME_TO_CLS[stat_name] + return stat_cls(**stat_kwargs) diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/utils/__init__.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c7ec7faf50a7227f74765ef8522af333a734ee63 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/utils/__init__.py @@ -0,0 +1,4 @@ +from .config_load import convert_none_to_str_na, convert_str_na_to_none, load_config, save_config +from .dict_tools import expand_dict, flatten_dict +from .logger import set_logging_verbosity +from .trial_extractor import extract_quant_config diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/utils/config_load.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/utils/config_load.py new file mode 100644 index 0000000000000000000000000000000000000000..87db0089ed993a2a6f97807b4ee40b763d729382 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/utils/config_load.py @@ -0,0 +1,59 @@ +from pathlib import Path + +import toml + + +def convert_str_na_to_none(d): + """ + Since toml does not support None, we use "NA" to represent None. + """ + if isinstance(d, dict): + for k, v in d.items(): + d[k] = convert_str_na_to_none(v) + elif isinstance(d, list): + d = [convert_str_na_to_none(v) for v in d] + elif isinstance(d, tuple): + d = tuple(convert_str_na_to_none(v) for v in d) + else: + if d == "NA": + return None + else: + return d + return d + + +def convert_none_to_str_na(d): + """ + Since toml does not support None, we use "NA" to represent None. + Otherwise the none-value key will be missing in the toml file. + """ + if isinstance(d, dict): + for k, v in d.items(): + d[k] = convert_none_to_str_na(v) + elif isinstance(d, list): + d = [convert_none_to_str_na(v) for v in d] + elif isinstance(d, tuple): + d = tuple(convert_none_to_str_na(v) for v in d) + else: + if d is None: + return "NA" + else: + return d + return d + + +def load_config(config_path): + """Load from a toml config file and convert "NA" to None.""" + with open(config_path, "r") as f: + config = toml.load(f) + config = convert_str_na_to_none(config) + return config + + +def save_config(config, config_path): + """Convert None to "NA" and save to a toml config file.""" + config = convert_none_to_str_na(config) + path = Path(config_path) + path.parent.mkdir(parents=True, exist_ok=True) + with open(config_path, "w") as f: + toml.dump(config, f) diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/utils/dict_tools.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/utils/dict_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..be50debc8757ebde4fa381d1c2f6002d75fe82d1 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/utils/dict_tools.py @@ -0,0 +1,89 @@ +def flatten_dict(d: dict, new_d: dict, join: str = ":", name: str = "root") -> dict: + """ + Flatten a nested dict to a flat dict with keys joined by `join`. 
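+    The flattened key/value pairs are written into `new_d` in place; the function
+    returns None despite the `-> dict` annotation.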
+ + --- + For example: + ```python + d = { + "a": 1, + "b": { + "c": 2, + "d": { + "e": 3, + "f": 4, + }, + }, + } + new_d = {} + flatten_dict(d, new_d, join=":", name="root") + print(new_d) + ``` + will print + ```text + { + "root:a": 1, + "root:b:c": 2, + "root:b:d:e": 3, + "root:b:d:f": 4, + } + ``` + """ + for k, v in d.items(): + if isinstance(v, dict): + flatten_dict(v, new_d, join, f"{name}{join}{k}") + else: + new_d[f"{name}{join}{k}"] = v + + +def expand_dict(d: dict, new_d: dict, join: str = ":", name: str = "root"): + """ + Expand a flat dict to a nested dict with keys joined by `join`. + + --- + For example: + ```python + d = { + "root:a": 1, + "root:b:c": 2, + "root:b:d:e": 3, + "root:b:d:f": 4, + } + new_d = {} + expand_dict(d, new_d, join=":", name="root") + print(new_d) + ``` + will print + ```text + { + "a": 1, + "b": { + "c": 2, + "d": { + "e": 3, + "f": 4, + }, + }, + } + ``` + """ + + def create_nested_dict(d: dict, key_list: list[str], value): + if len(key_list) == 1: + if key_list[0] not in d: + d[key_list[0]] = value + elif isinstance(d[key_list[0]], dict): + d[key_list[0]].update(value) + else: + raise ValueError( + f"Cannot create nested dict at {key_list} with value {value}" + ) + else: + if key_list[0] not in d: + d[key_list[0]] = {} + create_nested_dict(d[key_list[0]], key_list[1:], value) + + for k, v in d.items(): + k: str + key_list = k.removeprefix(f"{name}{join}").split(join) + create_nested_dict(new_d, key_list, v) diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/utils/gpu_hours_estimate.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/utils/gpu_hours_estimate.py new file mode 100644 index 0000000000000000000000000000000000000000..1857bc9cc0b4ba272211948c942d9df5d812c40a --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/utils/gpu_hours_estimate.py @@ -0,0 +1,37 @@ +import wandb +from datetime import datetime + +# Initialize the API +api = wandb.Api() + +entity = 'deepwok' +project = 'llm_q_scaling_law' + +# Query all runs in the project +runs = api.runs(f"{entity}/{project}") + +# Initialize the total GPU time counter +total_gpu_time = 0 +total_runs = 0 +# Loop through each run + +for run in runs: + total_runs += 1 + # Calculate the run duration (in seconds) + if run.created_at and "_wandb" in run.summary: + # check whether the run.summary["_wandb"] exists + duration_seconds = run.summary["_wandb"].get("runtime", 0) # default to 0 if for job not logged any metric + + # Get the number of GPUs from the system metadata + # this is only an estimate, as the actual number of GPUs used may vary + system_info = run.summary.get("_wandb", {}).get("system", {}) + num_gpus = system_info.get("gpu_count", 1) # Default to 1 if not found + + # Accumulate total GPU time + total_gpu_time += duration_seconds * num_gpus + +# Convert the total GPU time from seconds to hours +total_gpu_hours = total_gpu_time / 3600 + +print(f"Total number of runs: {total_runs}") +print(f"Total GPU usage for project: {total_gpu_hours:.2f} hours") \ No newline at end of file diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/utils/logger.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..e006595127cb3a018e933f48bc08be1d8213f574 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/utils/logger.py @@ -0,0 +1,42 @@ +import logging + +from colorlog import ColoredFormatter + +formatter = ColoredFormatter( + "%(log_color)s%(levelname)-8s%(reset)s 
%(blue)s%(message)s", + datefmt=None, + reset=True, + log_colors={ + "DEBUG": "cyan", + "INFO": "green", + "WARNING": "yellow", + "ERROR": "red", + "CRITICAL": "red,bg_white", + }, + secondary_log_colors={}, + style="%", +) + +handler = logging.StreamHandler() +handler.setFormatter(formatter) + +root_logger = logging.getLogger("llm_q_scaling_law") +root_logger.addHandler(handler) + + +def set_logging_verbosity(level: str = "info"): + level = level.lower() + match level: + case "debug": + root_logger.setLevel(logging.DEBUG) + case "info": + root_logger.setLevel(logging.INFO) + case "warning": + root_logger.setLevel(logging.WARNING) + case "error": + root_logger.setLevel(logging.ERROR) + case "critical": + root_logger.setLevel(logging.CRITICAL) + case _: + raise ValueError(f"Unknown logging level: {level}, should be one of: debug, info, warning, error, critical") + root_logger.info(f"Set logging level to {level}") diff --git a/llm-q-scaling-law-master/src/llm_q_scaling_law/utils/trial_extractor.py b/llm-q-scaling-law-master/src/llm_q_scaling_law/utils/trial_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..5b912c197230a19ebcba91c27df422bed7637283 --- /dev/null +++ b/llm-q-scaling-law-master/src/llm_q_scaling_law/utils/trial_extractor.py @@ -0,0 +1,47 @@ +import ast +import logging +from pprint import pformat + +import joblib +import optuna + +from .config_load import save_config + +logger = logging.getLogger(__name__) + + +def save_trial_to_quant_config(trial: optuna.trial.FrozenTrial, save_path: str = None): + def parse_and_create_item(quant_config: dict, keys: list[str], value): + for i, key in enumerate(keys): + if key not in quant_config: + quant_config[key] = {} + if i == len(keys) - 1: + quant_config[key] = value + else: + quant_config = quant_config[key] + + params = trial.params + + quant_config = {} + for name, value in params.items(): + keys = name.removeprefix("root:").split(":") + if isinstance(value, str) and value.startswith("!ast!"): + value = ast.literal_eval(value.removeprefix("!ast!")) + parse_and_create_item(quant_config, keys, value) + if save_path is not None: + save_config(quant_config, save_path) + return quant_config + + +def extract_quant_config( + study: optuna.Study | str, + target_idx, + save_path: str = None, +): + if not isinstance(study, optuna.Study): + with open(study, "rb") as f: + study: optuna.Study = joblib.load(f) + + target_trial = study.trials[target_idx] + quant_config = save_trial_to_quant_config(target_trial, save_path) + return quant_config diff --git a/llm-q-scaling-law-master/src/main.py b/llm-q-scaling-law-master/src/main.py new file mode 100644 index 0000000000000000000000000000000000000000..a4702f0e4783f8b465be7d75c35d2d7f20c7c7f0 --- /dev/null +++ b/llm-q-scaling-law-master/src/main.py @@ -0,0 +1,124 @@ +import logging +import os +from argparse import ArgumentParser +from pprint import pformat +from pathlib import Path +from transformers import set_seed +from transformers.utils.logging import set_verbosity_error, set_verbosity_info +import sys +import wandb + +sys.path.append(Path(__file__).parent.joinpath("lm-evaluation-harness").resolve().as_posix()) + +from llm_q_scaling_law.search import SearchRunner +from llm_q_scaling_law.utils import load_config, save_config + +logger = logging.getLogger(__file__) +# set logging verbosity +logger.setLevel(logging.INFO) + +os.environ["PYTHONBREAKPOINT"] = "ipdb.set_trace" +os.environ["TOKENIZERS_PARALLELISM"] = "false" + +# def cli_search_quantization_on_cls_glue(): +def 
cli(): + parser = ArgumentParser() + parser.add_argument("--model_arch", type=str, required=False, default=None) + parser.add_argument("--model_name", type=str, required=True) + parser.add_argument("--q_ratio", type=float, default=None) + parser.add_argument("--search_config", type=str, required=True) + parser.add_argument("--padding", type=str, default="max_length") + parser.add_argument("--max_length", type=int, default=1024) + parser.add_argument( + "--search_dataset_split", + type=str, + default="train", + choices=["train", "validation", "test"], + ) + parser.add_argument( + "--eval_dataset_split", + type=str, + default="validation", + choices=["train", "validation", "test"], + ) + parser.add_argument("--accelerator", type=str, default="cuda:0") + parser.add_argument("--model_parallel", action="store_true") + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--save_dir", type=str, required=True) + parser.add_argument("--disable_wandb", action="store_true", default=False) + parser.add_argument("--wandb_name", type=str, default=None) + parser.add_argument("--wandb_group", type=str, default=None) + parser.add_argument("--local_hf_cache", type=str, default=None) + args = parser.parse_args() + + search_config = load_config(args.search_config) + + if not args.disable_wandb: + run = wandb.init( + project="llm_q_scaling_law", + entity="deepwok", + config=search_config, + tags=[args.model_arch, args.model_name.replace("/", "_")], + name=args.wandb_name, + group=args.wandb_group, + ) + wandb.define_metric("avg_metric") + wandb.define_metric("saved_cost") + wandb.define_metric("eval_metric") + + if args.q_ratio is not None: + # allow overriding the q_ratio from the command line + logger.info("overwriting q_ratio from command line: {}".format(args.q_ratio)) + search_config["setup"]["ratio"] = args.q_ratio + + + logger.info("==================== Args ====================") + logger.info(pformat(vars(args))) + logger.info("search_config:\n {}".format(pformat(search_config))) + logger.info("==================== Search Starts ====================") + + if args.seed is not None: + set_seed(args.seed) + + search_runner = SearchRunner( + model_arch=args.model_arch, + model_name=args.model_name, + seq_len=args.max_length, + search_config=search_config, + save_dir=args.save_dir, + device=args.accelerator, + model_parallel=args.model_parallel, + enable_wandb=not args.disable_wandb, + local_hf_cache=args.local_hf_cache, + ) + + search_runner.search() + save_config(search_config, args.save_dir + "/search_config.toml") + logger.info("==================== Search Ends ====================") + + + # perform additional evaluation on different benchmark if specified through [evaluation] section in search_config + if "evaluation" in search_config and len(search_config["evaluation"]["tasks"]) > 0: + logger.info("==================== Evaluation Starts ====================") + best_eval_metric, best_search_metric, best_trial_idx, full_eval_results = search_runner.evaluate() + logger.info("Best eval metric: {}, search metric: {}, trail: {}".format(round(best_eval_metric,4), round(best_search_metric,4), best_trial_idx)) + if not args.disable_wandb: + wandb.log({"eval_metric": best_eval_metric}) + logger.info("==================== Evaluation Ends ====================") + + # upload the search result and logs to wandb after the search is done + if not args.disable_wandb: + wandb.save(args.save_dir + "/search_config.toml") + wandb.save(args.save_dir + "/search_results.pkl") + 
wandb.save(args.save_dir + "/best_trials.pkl") + wandb.save(args.save_dir + "/study.pkl") + wandb.save(args.save_dir + "/bf16_baseline_acc.json") + wandb.save(args.save_dir + "/bf16_baseline_cost.json") + wandb.save(args.save_dir + "/search_log.csv") + wandb.save(args.save_dir + "/eval_log.csv") + wandb.save(args.save_dir + "/full_eval_result.pkl") + wandb.save(args.save_dir + "/eval_result.json") + +if __name__ == "__main__": + set_verbosity_error() + cli() diff --git a/llm-q-scaling-law-master/src/ppl_test.py b/llm-q-scaling-law-master/src/ppl_test.py new file mode 100644 index 0000000000000000000000000000000000000000..851365aebdfe10602ed43fbf54b1d6e11e235e91 --- /dev/null +++ b/llm-q-scaling-law-master/src/ppl_test.py @@ -0,0 +1,265 @@ +from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling +import numpy as np +import torch +import datasets +import argparse +import os +from tqdm import tqdm +from torch.utils.data import DataLoader + +# disable warning for token length +from transformers.utils import logging +logging.set_verbosity(40) + +# this requires vllm to run, install vllm and huggingface datasets for this + +ALPACA_NON_EMPTY_PROMPTS = """ +Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. + +### Instruction: +{instruction} + +### Input: +{input} + +### Response: +""" + +ALPACA_EMPTY_PROMPTS = """ +Below is an instruction that describes a task. Write a response that appropriately completes the request. + +### Instruction: +{instruction} + +### Response: +""" + +def cli(): + parser = argparse.ArgumentParser() + parser.add_argument("--huggingface_cache", type=str) # use specific huggingface cache if specified + parser.add_argument("--model_name", type=str, default="facebook/opt-125m, facebook/opt-350m, facebook/opt-1.3b, facebook/opt-2.7b, facebook/opt-6.7b, facebook/opt-13b") # huggingface model name + parser.add_argument("--sweep", action="store_true", default=True) # whether to sweep along the model_name + parser.add_argument("--datasets", type=str, default="alpaca") # huggingface datasets name + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--sample_size", type=int, default=0) + parser.add_argument("--batch_size", type=int, default=8) + return parser.parse_args() + +def preprocess_data_module_open_alpaca( + raw_data_module, + tokenizer, + max_length, + num_proc: int, +) -> datasets.DatasetDict: + if tokenizer.pad_token in ["", None]: + tokenizer.pad_token = tokenizer.eos_token + + def tokenize_function(examples): + return tokenizer(["\n\n".join(examples["text"])]) + + encodings = raw_data_module.map( + tokenize_function, + batched=True, + remove_columns=raw_data_module["train"].column_names, + desc="Running tokenizer on dataset", + num_proc=num_proc, + ) + + def group_texts(examples): + # Concatenate all texts. + # >>> sum([[1,2,3],[4,5,6]],[]) + # [1, 2, 3, 4, 5, 6] + concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= max_length: + total_length = (total_length // max_length) * max_length + # Split by chunks of block_size. 
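+        # Illustrative example (not part of the original code): with max_length = 4 and
+        # concatenated input_ids [t0, t1, ..., t9], total_length is truncated from 10 to 8,
+        # so the chunks become [t0..t3] and [t4..t7]; the remainder [t8, t9] is dropped
+        # rather than padded.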
+ result = { + k: [t[i : i + max_length] for i in range(0, total_length, max_length)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + preprocessed = encodings.map( + group_texts, + batched=True, + num_proc=num_proc, + desc="Grouping texts", + ) + + return preprocessed + +def preprocess_data_module_wikitext2( + raw_dataset_dict, + tokenizer, + max_length, + num_proc: int, +) -> datasets.DatasetDict: + if tokenizer.pad_token in ["", None]: + tokenizer.pad_token = tokenizer.eos_token + + def tokenize_function(examples): + return tokenizer(["\n\n".join(examples["text"])]) + + encodings = raw_dataset_dict.map( + tokenize_function, + batched=True, + remove_columns=raw_dataset_dict["train"].column_names, + desc="Running tokenizer on dataset", + num_proc=num_proc, + ) + + def group_texts(examples): + # Concatenate all texts. + # >>> sum([[1,2,3],[4,5,6]],[]) + # [1, 2, 3, 4, 5, 6] + concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= max_length: + total_length = (total_length // max_length) * max_length + # Split by chunks of block_size. + result = { + k: [t[i : i + max_length] for i in range(0, total_length, max_length)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + preprocessed = encodings.map( + group_texts, + batched=True, + num_proc=num_proc, + desc="Grouping texts", + ) + + return preprocessed + +def preprocess_data_module_slim_pajama_6b( + raw_data_module, + tokenizer, + max_length, + num_proc: int, +) -> datasets.DatasetDict: + if tokenizer.pad_token in ["", None]: + tokenizer.pad_token = tokenizer.eos_token + + def tokenize_function(examples): + return tokenizer(["\n\n".join(examples["text"])]) + + encodings = raw_data_module.map( + tokenize_function, + batched=True, + remove_columns=raw_data_module["train"].column_names, + desc="Running tokenizer on dataset", + num_proc=num_proc, + ) + + def group_texts(examples): + # Concatenate all texts. + # >>> sum([[1,2,3],[4,5,6]],[]) + # [1, 2, 3, 4, 5, 6] + concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= max_length: + total_length = (total_length // max_length) * max_length + # Split by chunks of block_size. 
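+        # Note: the "labels" field added below is a direct copy of "input_ids"; Hugging Face
+        # causal-LM models shift the labels internally when computing the loss, so no manual
+        # shifting is needed here.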
+ result = { + k: [t[i : i + max_length] for i in range(0, total_length, max_length)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + preprocessed = encodings.map( + group_texts, + batched=True, + num_proc=num_proc, + desc="Grouping texts", + ) + + return preprocessed + +if __name__ == "__main__": + args = cli() + + # setup huggingface cache if specified + if args.huggingface_cache: + os.environ["HF_HOME"] = args.huggingface_cache + + + # Load and preprocess dataset + if args.datasets == "alpaca": + dataset_dict = datasets.load_dataset("tatsu-lab/alpaca") + + if args.datasets == "wikitext": + dataset_dict = datasets.load_dataset("wikitext", "wikitext-2-raw-v1") + + if args.datasets == "slim-pajama": + dataset_list = datasets.load_dataset("DKYoon/SlimPajama-6B", split=['train[:1%]','test']) + dataset_dict = datasets.DatasetDict({"train": dataset_list[0], "test": dataset_list[1]}) + + if args.sweep: + # convert model_name to a list of models + model_list = args.model_name.split(",") + model_list = [model.strip() for model in model_list] + + for model_name in model_list: + print("Calculating perplexity for model: {}".format(model_name)) + perplexities = [] + + # initalize model and tokenizer + model = AutoModelForCausalLM.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + # select padding token + tokenizer.pad_token = tokenizer.eos_token + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + # convert model to FP16 + model = model.half() + model.to(device) + model.eval() + batch_size = args.batch_size + data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) + + if args.datasets == "alpaca": + preprocessed_dataset_dict = preprocess_data_module_open_alpaca(dataset_dict, tokenizer, 1024, 64) + train_dataset = preprocessed_dataset_dict["train"] + + if args.datasets == "wikitext": + preprocessed_dataset_dict = preprocess_data_module_wikitext2(dataset_dict, tokenizer, 1024, 64) + train_dataset = preprocessed_dataset_dict["test"] + + if args.datasets == "slim-pajama": + preprocessed_dataset_dict = preprocess_data_module_slim_pajama_6b(dataset_dict, tokenizer, 1024, 128) + train_dataset = preprocessed_dataset_dict["test"] + + if args.sample_size > 0: + np.random.seed(args.seed) + subsample_indices = np.random.choice(len(train_dataset), args.sample_size, replace=False) + train_dataset = train_dataset.select(subsample_indices) + + train_loader = DataLoader( + train_dataset, + batch_size=args.batch_size, + shuffle=False, + collate_fn=data_collator, + num_workers=8, + ) + + for i,batch in tqdm(enumerate(train_loader), total=len(train_loader),desc="Calculating perplexity: "): + inputs = batch["input_ids"].to(device) + attention_mask = batch["attention_mask"].to(device) + with torch.no_grad(): + outputs = model(inputs, labels=inputs, attention_mask=attention_mask) + loss = outputs.loss.item() + perplexity = np.exp(loss) + perplexities.append(perplexity) + + mean_perplexity = round(np.mean(perplexities), 2) + print("model: {}, mean perplexity: {}".format(model_name, mean_perplexity)) \ No newline at end of file diff --git a/llm-q-scaling-law-master/src/test_pippy.py b/llm-q-scaling-law-master/src/test_pippy.py new file mode 100644 index 0000000000000000000000000000000000000000..1625a8cd33d73a4348498617171325a022ada306 --- /dev/null +++ b/llm-q-scaling-law-master/src/test_pippy.py @@ -0,0 +1,140 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This example requires torch>=2.4.0 to run:
+tracing + attention in torch<2.4.0 will fail because RMSNorm is not supported in tracing.
+"""
+
+import torch
+import numpy as np
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
+from accelerate import PartialState, prepare_pippy
+import datasets
+
+from llm_q_scaling_law.custom_tasks.open_alpaca import preprocess_data_module_open_alpaca, calculate_alpaca_ppl
+from llm_q_scaling_law.custom_tasks.huggingface_evaluator import huggingface_dataset_process_limit
+
+model_name = "Qwen/Qwen1.5-7B"
+limit = 100
+batch_size = 2
+# assuming first device = 0 for now
+device = 0
+
+# the sdpa attention implementation (the default for torch>2.1.2) fails with tracing + the attention-mask kwarg
+model = AutoModelForCausalLM.from_pretrained(
+    model_name
+)
+# model.eval()
+
+# Input configs
+# Create example inputs for the model
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+tokenizer.pad_token = tokenizer.eos_token
+
+# preprocess data
+dataset_dict = datasets.load_dataset("tatsu-lab/alpaca")
+preprocessed_dataset_dict = preprocess_data_module_open_alpaca(
+    dataset_dict, tokenizer, 1024, 64
+)
+train_dataset = preprocessed_dataset_dict["train"]
+train_dataset = huggingface_dataset_process_limit(train_dataset, limit)
+
+# create data_loader
+data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+train_loader = torch.utils.data.DataLoader(
+    train_dataset,
+    batch_size=batch_size,
+    collate_fn=data_collator,
+    shuffle=False,
+    num_workers=8,
+)
+
+# use the first batch of inputs as example inputs
+example_inputs = next(iter(train_loader))
+example_inputs = {k: v.to(device) for k, v in example_inputs.items() if k != "labels"}
+
+# prompts = ("I would like to", "I really like to", "The weather is pretty") # bs = 3
+# example_inputs = tokenizer(prompts, return_tensors="pt", padding=True)
+# example_inputs = example_inputs.to(device)
+
+# if PartialState().is_last_process:
+#     print("Example inputs: ", example_inputs)
+
+# Create a pipeline stage from the model
+# Using `auto` is equivalent to letting `device_map="auto"` figure
+# out device mapping and will also split the model according to the
+# number of total GPUs available if it fits on one GPU
+model = prepare_pippy(model, split_points="auto", example_kwargs=example_inputs)
+
+if PartialState().is_last_process:
+    print("Model prepared")
+
+with torch.no_grad():
+    output = model(**example_inputs)
+
+# if PartialState().is_last_process:
+#     print("Output: ", output)
+
+# You can pass `gather_output=True` to have the output from the model
+# available on all GPUs
+# model = prepare_pippy(model, split_points="auto", example_args=(input,), gather_output=True)
+
+if PartialState().is_last_process:
+    print("Start Running Evaluator")
+
+# start running the evaluator
+perplexities = []
+
+for i,batch in tqdm(enumerate(train_loader), total=len(train_loader),desc="Calculating 
perplexity: "): + # drop the labels in the batch + inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"} + + # input_ids = inputs["input_ids"] + # attention_mask = inputs["attention_mask"] + # labels = batch["labels"].to(device) + + with torch.no_grad(): + # create a single input with outputs = model(**inputs) + outputs = model(**inputs) + # outputs = model(input_ids, attention_mask=attention_mask, labels=labels) + + # if PartialState().is_last_process: + # breakpoint() + + if PartialState().is_last_process: + # only calculate perplexity on the last process + output_device = outputs[0].device + labels = batch["labels"].to(output_device) + + logits = outputs[0][..., :-1, :].contiguous() + labels = labels[..., 1:].contiguous() + + flat_logits = logits.view(-1, logits.size(-1)) + flat_labels = labels.view(-1) + + loss = torch.nn.functional.cross_entropy(flat_logits, flat_labels, reduction="mean") + perplexity = torch.exp(loss) + + perplexities.append(perplexity.item()) + + +# The outputs are only on the final process by default +if PartialState().is_last_process: + # ppl = calculate_alpaca_ppl(result_dict) + # print(ppl) + ppl = np.mean(perplexities) + print("Perplexity: ", ppl) + print("Done") \ No newline at end of file diff --git a/llm-q-scaling-law-master/src/test_quantize.py b/llm-q-scaling-law-master/src/test_quantize.py new file mode 100644 index 0000000000000000000000000000000000000000..a48101dab7017b65cb133f32ca0a478d56328ad4 --- /dev/null +++ b/llm-q-scaling-law-master/src/test_quantize.py @@ -0,0 +1,68 @@ +from copy import deepcopy +import torch +from transformers import AutoModelForCausalLM, AutoConfig +from llm_q_scaling_law.models import quantize_transformer + + +@torch.no_grad() +def test_quantize_opt(): + model_name = "facebook/opt-125m" + q_config = {"w": {"name": "bypass"}, "x": {"name": "bypass"}} + model_ref = AutoModelForCausalLM.from_pretrained(model_name) + model_ref.eval() + model_q, _ = quantize_transformer( + deepcopy(model_ref), q_config, op_ids=[0, 1, 2, 3, 4], granularity="decoder_layer" + ) + model_q.eval() + + x = torch.randint(0, 1245, (1, 10)).cuda() + model_ref.cuda() + model_q.cuda() + y_ref = model_ref(x, labels=x).logits + y_q = model_q(x, labels=x).logits + + print(model_q) + print((y_ref - y_q).abs().max()) + assert torch.all(y_ref == y_q) + + q_config = { + "linear": { + "x": { + "name": "mxint", + "width": 8, + "block_size": 16, + "block_axis": -2, + }, + "w": { + "name": "mxint", + "width": 8, + "block_size": 16, + "block_axis": -2, + }, + }, + "matmul": { + "x": { + "name": "mxint", + "width": 8, + "block_size": 16, + "block_axis": -2, + }, + "w": { + "name": "mxint", + "width": 8, + "block_size": 16, + "block_axis": -1, + }, + }, + } + model_q, _ = quantize_transformer( + deepcopy(model_ref), q_config, op_ids=[0, 1, 2, 3, 4], granularity="decoder_layer" + ) + model_q.eval() + + y_q = model_q(x, labels=x).logits + print((y_ref - y_q).abs().max()) + + +if __name__ == "__main__": + test_quantize_opt() diff --git a/llm-q-scaling-law-master/src/vllm_calculate_output_perplexity.py b/llm-q-scaling-law-master/src/vllm_calculate_output_perplexity.py new file mode 100644 index 0000000000000000000000000000000000000000..d667cd86529c64df61f5c5d63cafe1c6c7538247 --- /dev/null +++ b/llm-q-scaling-law-master/src/vllm_calculate_output_perplexity.py @@ -0,0 +1,118 @@ +from vllm import LLM, SamplingParams +import numpy as np +import torch +import datasets +import argparse +import os + +# this requires vllm to run, install vllm and huggingface datasets 
for this
+
+ALPACA_NON_EMPTY_PROMPTS = """
+Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+### Instruction:
+{instruction}
+
+### Input:
+{input}
+
+### Response:
+"""
+
+ALPACA_EMPTY_PROMPTS = """
+Below is an instruction that describes a task. Write a response that appropriately completes the request.
+
+### Instruction:
+{instruction}
+
+### Response:
+"""
+
+def cli():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--huggingface_cache", type=str) # use specific huggingface cache if specified
+    parser.add_argument("--model_name", type=str, required=True) # huggingface model name
+    parser.add_argument("--datasets", type=str, default="alpaca") # huggingface datasets name
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--tensor_parallel", type=int, default=1)
+    parser.add_argument("--sample_size", type=int, default=0)
+    return parser.parse_args()
+
+if __name__ == "__main__":
+    args = cli()
+
+    # setup huggingface cache if specified
+    if args.huggingface_cache:
+        os.environ["HF_HOME"] = args.huggingface_cache
+
+    # Load model
+    if args.tensor_parallel > 1:
+        model = LLM(model=args.model_name, tensor_parallel_size=args.tensor_parallel)
+    else:
+        model = LLM(model=args.model_name)
+
+    # Load and preprocess dataset
+    # load alpaca dataset
+    if args.datasets == "alpaca":
+        datas = datasets.load_dataset("yahma/alpaca-cleaned")
+        datas = datas["train"]
+        prompts = []
+        outputs = []
+        for data in datas:
+            if data["input"] == "":
+                prompts.append(ALPACA_EMPTY_PROMPTS.format(instruction=data["instruction"]))
+                outputs.append(data["output"])
+            else:
+                prompts.append(ALPACA_NON_EMPTY_PROMPTS.format(instruction=data["instruction"], input=data["input"]))
+                outputs.append(data["output"])
+
+        # truncate prompts and outputs to 512 characters to increase speed
+        prompts = [p[:512] for p in prompts]
+        outputs = [o[:512] for o in outputs]
+
+    if args.sample_size > 0:
+        # randomly sample a subset of the data with the specified size
+        np.random.seed(args.seed)
+        indices = np.random.choice(len(prompts), args.sample_size, replace=False)
+        prompts = [prompts[i] for i in indices]
+        outputs = [outputs[i] for i in indices]
+
+    # prompts = prompts[:100] # for testing
+
+    sampling_params = SamplingParams(temperature=1.0, logprobs=True, max_tokens=512) # cap generation at 512 tokens to increase speed
+
+    generated_outputs = model.generate(prompts, sampling_params=sampling_params)
+
+    generated_texts = []
+    generated_cumulative_logprobs = []
+    perplexities = []
+
+    for generated_output in generated_outputs:
+        generated_texts.append(generated_output.outputs[0].text)
+        generated_cumulative_logprobs.append(generated_output.outputs[0].cumulative_logprob)
+
+        # we calculate the perplexity as exp(loss)
+        # here the loss is the NLL loss, so we calculate perplexity as exp(-NLL)
+        # NLL is calculated as -1/N * sum(log(probabilities))
+
+        perplexity = np.exp(-generated_output.outputs[0].cumulative_logprob/len(generated_output.outputs[0].logprobs))
+        perplexities.append(perplexity)
+
+    mean_perplexity = round(np.mean(perplexities), 2)
+    print("model: {}, mean perplexity: {}".format(args.model_name, mean_perplexity))
+
+    # generate an output file logging the perplexities
+    model_name = args.model_name.split("/")[-1]
+    file_name = f"perplexities_{model_name}.txt"
+
+    # create the result folder if it does not exist
+    if not os.path.exists("result"):
+        os.makedirs("result")
+    
file_name = os.path.join("result", file_name) + # create the file if it does not exist + if not os.path.exists(file_name): + with open(file_name, "w") as f: + f.write("model: {}\n".format(args.model_name)) + + with open(file_name, "a") as f: + f.write("seed: {}, perplexity: {}\n".format(args.seed, mean_perplexity)) \ No newline at end of file
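
Closing note: the per-sequence perplexity in `vllm_calculate_output_perplexity.py` is the exponential of the mean negative log-likelihood over the generated tokens. A minimal sketch of that arithmetic, using made-up logprob values rather than real vLLM output:

```python
import numpy as np

# hypothetical per-token logprobs for one generated sequence (illustrative only)
token_logprobs = [-1.0, -2.0, -3.0]

cumulative_logprob = sum(token_logprobs)            # -6.0
nll = -cumulative_logprob / len(token_logprobs)     # mean negative log-likelihood = 2.0
perplexity = np.exp(nll)                            # e^2 ~= 7.39

print(round(perplexity, 2))
```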