Alex Sadleir committed
Commit a1edf95 · 1 Parent(s): d5fb6c3

add int4/int8

.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__/
README.md CHANGED
@@ -33,7 +33,7 @@ This demonstrates how ONNX conversion can offload more computation for faster, h
 ```
 2. Export the ONNX model:
 ```sh
- optimum-cli export onnx --model google/embeddinggemma-300m-qat-q4_0-unquantized embeddinggemma-300m-onnx
+ optimum-cli export onnx --model google/embeddinggemma-300m-qat-q4_0-unquantized --optimize O3 --slim embeddinggemma-300m-onnx
 python download_missing_hf_files.py
 ```
 
__pycache__/onnx_gemma3_pipeline.cpython-312.pyc DELETED
Binary file (6.63 kB)
 
download_missing_hf_files.py CHANGED
@@ -67,3 +67,36 @@ torch.onnx.export(
     opset_version=14
 )
 print("Exported dense2.onnx")
+
+ # # Quantize dense1.onnx and dense2.onnx to int4 using ONNX Runtime matmul_nbits_quantizer
+ # from onnxruntime.quantization import (
+ #     matmul_nbits_quantizer,
+ #     quant_utils
+ # )
+ # from pathlib import Path
+
+ # onnx_dir = Path(onnx_dir)
+ # for dense_name in ["dense1.onnx", "dense2.onnx"]:
+ #     model_fp32_path = onnx_dir / dense_name
+ #     model_int4_path = model_fp32_path  # Overwrite original file
+ #     quant_config = matmul_nbits_quantizer.DefaultWeightOnlyQuantConfig(
+ #         block_size=128,
+ #         is_symmetric=True,
+ #         accuracy_level=4,
+ #         quant_format=quant_utils.QuantFormat.QOperator,
+ #         op_types_to_quantize=("MatMul", "Gather"),
+ #         quant_axes=(("MatMul", 0), ("Gather", 1))
+ #     )
+ #     model = quant_utils.load_model_with_shape_infer(model_fp32_path)
+ #     quant = matmul_nbits_quantizer.MatMulNBitsQuantizer(
+ #         model,
+ #         nodes_to_exclude=None,
+ #         nodes_to_include=None,
+ #         algo_config=quant_config,
+ #     )
+ #     quant.process()
+ #     quant.model.save_model_to_file(
+ #         str(model_int4_path),
+ #         True
+ #     )
+ #     print(f"Quantized {dense_name} to int4 and overwrote original file.")
embeddinggemma-300m/model.onnx CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:dee985629c11dd0f70531093aeb8e8f7f5ddfb403f6c2705db340d58e4e03ffb
- size 1212541436
+ oid sha256:511afd7b7ed2b58a61876a6aef4c1113a93b2afad17cf363753eef48f4669a41
+ size 1212258625
embeddinggemma-300m/onnx/model_int4.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f79136a20219163cdf5701a85422ccde126db44d2d51f6bcfc07b63edc0efab9
+ size 869894596
embeddinggemma-300m/onnx/model_int8.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7c74d91f9c44be1b3a7cbe5875d61597b493ae2789e7a62f540b351eaf0cdc57
+ size 265916124
float16.py ADDED
@@ -0,0 +1,878 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) Microsoft Corporation, Hugging Face. All rights reserved.
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in all
13
+ # copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+
24
+ from typing import Optional
25
+ import itertools
26
+ import numpy as np
27
+ import onnx
28
+ import packaging.version as pv
29
+ import warnings
30
+ from onnx import helper, numpy_helper
31
+ from onnx import onnx_pb as onnx_proto
32
+ import onnxslim.third_party.onnx_graphsurgeon as gs
33
+
34
+
35
+ FLOAT32 = 1
36
+ FLOAT16 = 10
37
+
38
+
39
+ def _npfloat16_to_int(np_list):
40
+ """
41
+ Convert numpy float16 to python int.
42
+
43
+ :param np_list: numpy float16 list
44
+ :return int_list: python int list
45
+ """
46
+ return [int(bin(_.view("H"))[2:].zfill(16), 2) for _ in np_list]
47
+
48
+
49
+ def convert_np_to_float16(np_array, min_positive_val=1e-7, max_finite_val=1e4):
50
+ """
51
+ Convert float32 numpy array to float16 without changing sign or finiteness.
52
+ Positive values less than min_positive_val are mapped to min_positive_val.
53
+ Positive finite values greater than max_finite_val are mapped to max_finite_val.
54
+ Similar for negative values. NaN, 0, inf, and -inf are unchanged.
55
+ """
56
+
57
+ def between(a, b, c):
58
+ return np.logical_and(a < b, b < c)
59
+
60
+ positive_values = np_array[np.where(np_array > 0)]
61
+ if positive_values.shape[0] > 0:
62
+ pos_max = positive_values.max()
63
+ pos_min = positive_values.min()
64
+
65
+ if pos_max >= max_finite_val:
66
+ warnings.warn(
67
+ "the float32 number {} will be truncated to {}".format(
68
+ pos_max, max_finite_val
69
+ )
70
+ )
71
+
72
+ if pos_min <= min_positive_val:
73
+ warnings.warn(
74
+ "the float32 number {} will be truncated to {}".format(
75
+ pos_min, min_positive_val
76
+ )
77
+ )
78
+
79
+ negative_values = np_array[np.where(np_array < 0)]
80
+ if negative_values.shape[0] > 0:
81
+ neg_max = negative_values.max()
82
+ neg_min = negative_values.min()
83
+
84
+ if neg_min <= -max_finite_val:
85
+ warnings.warn(
86
+ "the float32 number {} will be truncated to {}".format(
87
+ neg_min, -max_finite_val
88
+ )
89
+ )
90
+
91
+ if neg_max >= -min_positive_val:
92
+ warnings.warn(
93
+ "the float32 number {} will be truncated to {}".format(
94
+ neg_max, -min_positive_val
95
+ )
96
+ )
97
+
98
+ np_array = np.where(
99
+ between(0, np_array, min_positive_val), min_positive_val, np_array
100
+ )
101
+ np_array = np.where(
102
+ between(-min_positive_val, np_array, 0), -min_positive_val, np_array
103
+ )
104
+ np_array = np.where(
105
+ between(max_finite_val, np_array, float("inf")), max_finite_val, np_array
106
+ )
107
+ np_array = np.where(
108
+ between(float("-inf"), np_array, -max_finite_val), -max_finite_val, np_array
109
+ )
110
+ return np.float16(np_array)
111
+
112
+
113
+ def convert_tensor_float_to_float16(tensor, min_positive_val=1e-7, max_finite_val=1e4):
114
+ """
115
+ Convert tensor float to float16.
116
+
117
+ :param tensor: TensorProto object
118
+ :return tensor_float16: converted TensorProto object
119
+ """
120
+ if not isinstance(tensor, onnx_proto.TensorProto):
121
+ raise ValueError(
122
+ "Expected input type is an ONNX TensorProto but got %s" % type(tensor)
123
+ )
124
+
125
+ if tensor.data_type == onnx_proto.TensorProto.FLOAT:
126
+ tensor.data_type = onnx_proto.TensorProto.FLOAT16
127
+ # convert float_data (float type) to float16 and write to int32_data
128
+ if tensor.float_data:
129
+ float16_data = convert_np_to_float16(
130
+ np.array(tensor.float_data), min_positive_val, max_finite_val
131
+ )
132
+ int_list = _npfloat16_to_int(float16_data)
133
+ tensor.int32_data[:] = int_list
134
+ tensor.float_data[:] = []
135
+ # convert raw_data (bytes type)
136
+ if tensor.raw_data:
137
+ # convert tensor.raw_data (bytes) to a float32 array
138
+ float32_list = np.frombuffer(tensor.raw_data, dtype="float32")
139
+ # convert float to float16
140
+ float16_list = convert_np_to_float16(
141
+ float32_list, min_positive_val, max_finite_val
142
+ )
143
+ # convert float16 to bytes and write back to raw_data
144
+ tensor.raw_data = float16_list.tobytes()
145
+ return tensor
146
+
147
+
148
+ def make_value_info_from_tensor(tensor):
149
+ shape = numpy_helper.to_array(tensor).shape
150
+ return helper.make_tensor_value_info(tensor.name, tensor.data_type, shape)
151
+
152
+
153
+ DEFAULT_OP_BLOCK_LIST = [
154
+ "ArrayFeatureExtractor",
155
+ "Binarizer",
156
+ "CastMap",
157
+ "CategoryMapper",
158
+ "DictVectorizer",
159
+ "FeatureVectorizer",
160
+ "Imputer",
161
+ "LabelEncoder",
162
+ "LinearClassifier",
163
+ "LinearRegressor",
164
+ "Normalizer",
165
+ "OneHotEncoder",
166
+ "RandomUniformLike",
167
+ "SVMClassifier",
168
+ "SVMRegressor",
169
+ "Scaler",
170
+ "TreeEnsembleClassifier",
171
+ "TreeEnsembleRegressor",
172
+ "ZipMap",
173
+ "NonMaxSuppression",
174
+ "TopK",
175
+ "RoiAlign",
176
+ "Resize",
177
+ # 'Range',
178
+ "CumSum",
179
+ "Min",
180
+ "Max",
181
+ "Upsample",
182
+ # NEW:
183
+ "RandomNormalLike",
184
+ # TODO: Ideally, "Cast" nodes should not be here, for the following reasons:
185
+ # - It breaks the semantics that the default list contains "ops that are not supported for float16 in ONNX Runtime".
186
+ # - When fp32 casts already exist in the model (e.g., for rotary embeddings), this script will insert redundant casts around it.
187
+ # However, without it, the graphs produced are invalid. Eventually, we will resolve this.
188
+ "Cast",
189
+ ]
190
+
191
+
192
+ def initial_checking(model, disable_shape_infer):
193
+ func_infer_shape = None
194
+ if not disable_shape_infer and pv.Version(onnx.__version__) >= pv.Version("1.2"):
195
+ try:
196
+ from onnx.shape_inference import infer_shapes
197
+
198
+ func_infer_shape = infer_shapes
199
+ finally:
200
+ pass
201
+
202
+ if not isinstance(model, onnx_proto.ModelProto):
203
+ raise ValueError(
204
+ "Expected model type is an ONNX ModelProto but got %s" % type(model)
205
+ )
206
+
207
+ if func_infer_shape is not None:
208
+ model = func_infer_shape(model)
209
+
210
+ is_fp16_ready_flag = check_if_fp16_ready(model.graph)
211
+
212
+ return model, func_infer_shape, is_fp16_ready_flag
213
+
214
+
215
+ def convert_float_to_float16(
216
+ model,
217
+ min_positive_val=1e-7,
218
+ max_finite_val=1e4,
219
+ keep_io_types=False,
220
+ disable_shape_infer=False,
221
+ op_block_list=None,
222
+ node_block_list=None,
223
+ check_fp16_ready=True,
224
+ ):
225
+
226
+ # create blocklists
227
+ if op_block_list is None:
228
+ op_block_list = DEFAULT_OP_BLOCK_LIST
229
+ if node_block_list is None:
230
+ node_block_list = []
231
+ op_block_list = set(op_block_list)
232
+ node_block_list = set(node_block_list)
233
+
234
+ global_input_name_dict = (
235
+ {}
236
+ ) # key: input name, value: new output name after Cast node
237
+ # basic checking, including shape inference
238
+ model, func_infer_shape, is_fp16_ready_flag = initial_checking(
239
+ model, disable_shape_infer
240
+ )
241
+ if is_fp16_ready_flag and check_fp16_ready:
242
+ raise ValueError(
243
+ "The model is already converted to float16, if convert again, the model might be wrong. \n If you are sure to convert again, please set check_fp16_ready=False."
244
+ )
245
+
246
+ graph_stack = [model.graph]
247
+
248
+ is_top_level = True
249
+ while graph_stack:
250
+ next_level = []
251
+ for curr_graph in graph_stack:
252
+ process_graph_input(
253
+ curr_graph, is_top_level, keep_io_types, global_input_name_dict
254
+ )
255
+ value_info_block_list = process_tensor_in_node(
256
+ curr_graph,
257
+ op_block_list,
258
+ node_block_list,
259
+ min_positive_val,
260
+ max_finite_val,
261
+ )
262
+ process_value_info(curr_graph, value_info_block_list)
263
+ process_node_in_block_list(
264
+ curr_graph, global_input_name_dict, op_block_list, node_block_list
265
+ )
266
+ process_initializers(
267
+ curr_graph,
268
+ op_block_list,
269
+ node_block_list,
270
+ min_positive_val,
271
+ max_finite_val,
272
+ )
273
+ process_graph_output(curr_graph, is_top_level, keep_io_types)
274
+ sub_graph_list = get_next_level_graph(
275
+ curr_graph, op_block_list, node_block_list
276
+ )
277
+ if len(sub_graph_list) > 0:
278
+ next_level.extend(sub_graph_list)
279
+
280
+ if not is_top_level:
281
+ process_node_input_output(curr_graph, global_input_name_dict)
282
+ is_top_level = False # Going to process sub-graph
283
+ graph_stack = next_level
284
+
285
+ remove_unnecessary_cast_node(model.graph)
286
+
287
+ # Topologically sort the graph
288
+ # NOTE: We do not perform another round of optimization as the model is already optimized
289
+ graph = gs.import_onnx(model)
290
+ graph.toposort()
291
+ model = gs.export_onnx(graph)
292
+
293
+ return model
294
+
295
+
296
+ # Change the input/output of the node to the new output name after Cast node for sub-graph
297
+ # because sub-graphs have no value_info entries to start from
298
+ def process_node_input_output(
299
+ graph: onnx_proto.GraphProto, global_input_name_dict: dict
300
+ ):
301
+ for node in graph.node:
302
+ for i, input_name in enumerate(node.input):
303
+ if input_name in global_input_name_dict:
304
+ node.input[i] = global_input_name_dict[input_name]
305
+ for i, output_name in enumerate(node.output):
306
+ if output_name in global_input_name_dict:
307
+ node.output[i] = global_input_name_dict[output_name]
308
+
309
+
310
+ def process_graph_input(
311
+ graph: onnx_proto.GraphProto,
312
+ is_top_level: bool,
313
+ is_io_fp32: bool,
314
+ global_input_name_dict: dict,
315
+ ):
316
+ # The input dtype is float32, need to cast to fp16
317
+ if is_top_level and is_io_fp32:
318
+ for graph_input in graph.input: # n_input is ValueInfoProto
319
+ if graph_input.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT:
320
+ downstream_nodes = find_downstream_node_by_input_name(
321
+ graph, graph_input.name
322
+ )
323
+ for d_node in downstream_nodes:
324
+ # More than one node may consume the model input, so we only create
325
+ # a single cast node, and then reuse this node when needed.
326
+ cast_exists = graph_input.name in global_input_name_dict
327
+ if cast_exists:
328
+ cast_node_output_name = global_input_name_dict[graph_input.name]
329
+ else:
330
+ cast_node_output_name = graph_input.name + "_fp16"
331
+ add_cast_node(
332
+ graph,
333
+ [graph_input.name],
334
+ [cast_node_output_name],
335
+ cast_node_output_name, # Set node name same as output name
336
+ FLOAT16,
337
+ )
338
+ add_new_value_info(
339
+ graph,
340
+ graph_input,
341
+ cast_node_output_name,
342
+ onnx_proto.TensorProto.FLOAT16,
343
+ )
344
+ for i, input_name in enumerate(d_node.input):
345
+ if input_name == graph_input.name:
346
+ d_node.input[i] = (
347
+ cast_node_output_name # Change the input of the second node
348
+ )
349
+ global_input_name_dict[graph_input.name] = (
350
+ cast_node_output_name
351
+ )
352
+
353
+ # For the sub-graph, don't do cast
354
+ else: # Change the input dtype to fp16 without any cast
355
+ for graph_input in graph.input:
356
+ if graph_input.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT:
357
+ graph_input.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
358
+
359
+
360
+ def process_graph_output(
361
+ graph: onnx_proto.GraphProto, is_top_level: bool, is_io_fp32: bool
362
+ ):
363
+ if is_top_level and is_io_fp32: # the output dtype is float32, need to cast to fp16
364
+ for i, graph_output in enumerate(graph.output):
365
+ if graph_output.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT:
366
+ new_producer_name = graph_output.name + "_fp16"
367
+ original_name = graph_output.name # The correct output name
368
+
369
+ # Get the node(s) that produce the model output
370
+ # These will most likely be fp16, but could be fp32 if the previous node is in block_list
371
+ upstream_nodes = find_upstream_node_by_output_name(graph, original_name)
372
+ assert len(upstream_nodes) == 1 # Should be only one node
373
+
374
+ producer_node = upstream_nodes[0]
375
+
376
+ for i, output_name in enumerate(producer_node.output):
377
+ if output_name == original_name:
378
+ producer_node.output[i] = new_producer_name
379
+
380
+ cast_node_name = new_producer_name + "_input_cast" + str(i)
381
+ add_cast_node(
382
+ graph,
383
+ [new_producer_name],
384
+ [original_name],
385
+ cast_node_name,
386
+ onnx_proto.TensorProto.FLOAT,
387
+ )
388
+ for value_info in graph.value_info:
389
+ if original_name == value_info.name:
390
+ value_info.type.tensor_type.elem_type = (
391
+ onnx_proto.TensorProto.FLOAT
392
+ )
393
+
394
+ # Get the node(s) that consume the model output
395
+ downstream_nodes = find_downstream_node_by_input_name(
396
+ graph,
397
+ original_name,
398
+ include_subgraphs=False,
399
+ )
400
+
401
+ # It is possible that the producer node is also input to downstream nodes
402
+ # So, we update the inputs of these downstream nodes
403
+ for d_node in downstream_nodes:
404
+ for i, input_name in enumerate(d_node.input):
405
+ if input_name == original_name:
406
+ d_node.input[i] = new_producer_name
407
+
408
+ else: # change the output dtype to fp16 in tensor
409
+ for graph_output in graph.output:
410
+ if graph_output.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT:
411
+ graph_output.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
412
+
413
+
414
+ def process_node_in_block_list(
415
+ graph: onnx_proto.GraphProto,
416
+ global_input_name_dict: dict,
417
+ op_block_list: list,
418
+ node_block_list: list,
419
+ ):
420
+ # NB: Important to create a copy of the nodes in the graph to avoid modifying
421
+ # the graph in-place while iterating (causing an infinite loop)
422
+ for node in list(graph.node):
423
+ if (node.op_type in op_block_list) or (node.name in node_block_list):
424
+ insert_cast32_before_node(graph, node, global_input_name_dict)
425
+ insert_cast16_after_node(graph, node, global_input_name_dict)
426
+
427
+
428
+ # Todo: global_input_name_dict still not fill value
429
+ def insert_cast32_before_node(
430
+ graph: onnx_proto.GraphProto, node: onnx_proto.NodeProto, global_input_name_dict
431
+ ):
432
+ for i, input_name in enumerate(node.input):
433
+ for value_info in itertools.chain(graph.value_info, graph.input):
434
+ if input_name == value_info.name:
435
+ if (
436
+ value_info.type.tensor_type.elem_type
437
+ != onnx_proto.TensorProto.FLOAT16
438
+ ):
439
+ break
440
+ cast_output_name = node.name + "_input_cast_" + str(i)
441
+ add_new_value_info(
442
+ graph, value_info, cast_output_name, onnx_proto.TensorProto.FLOAT
443
+ )
444
+ cast_node_name = node.name + "_input_cast" + str(i)
445
+ add_cast_node(
446
+ graph,
447
+ [input_name],
448
+ [cast_output_name],
449
+ cast_node_name,
450
+ onnx_proto.TensorProto.FLOAT,
451
+ )
452
+ node.input[i] = cast_output_name
453
+ break
454
+
455
+
456
+ # Todo: global_input_name_dict still not fill value
457
+ def insert_cast16_after_node(
458
+ graph: onnx_proto.GraphProto, node: onnx_proto.NodeProto, global_input_name_dict
459
+ ):
460
+ for i, output_name in enumerate(node.output):
461
+ for value_info in itertools.chain(graph.value_info, graph.output):
462
+ if output_name == value_info.name:
463
+ if (
464
+ value_info.type.tensor_type.elem_type
465
+ != onnx_proto.TensorProto.FLOAT
466
+ ):
467
+ break
468
+ cast_input_name = node.name + "_output_cast_" + str(i)
469
+ add_new_value_info(
470
+ graph, value_info, cast_input_name, onnx_proto.TensorProto.FLOAT
471
+ )
472
+ value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
473
+ cast_node_name = node.name + "_output_cast" + str(i)
474
+ add_cast_node(
475
+ graph,
476
+ [cast_input_name],
477
+ [output_name],
478
+ cast_node_name,
479
+ onnx_proto.TensorProto.FLOAT16,
480
+ )
481
+ node.output[i] = cast_input_name
482
+ break
483
+
484
+
485
+ # Process tensor data in attribute of the node
486
+ def process_tensor_in_node(
487
+ graph: onnx_proto.GraphProto,
488
+ op_block_list: list,
489
+ node_block_list: list,
490
+ min_positive_val,
491
+ max_finite_val,
492
+ ):
493
+ value_info_block_list = set() # This is for later use, not in this step
494
+ for node in graph.node:
495
+ # NOTE: "Cast" operation cannot change its output type because it is strongly typed.
496
+ if (
497
+ (node.op_type in op_block_list)
498
+ or (node.name in node_block_list)
499
+ or (node.op_type == "Cast")
500
+ ):
501
+ # if (node.op_type in op_block_list) or (node.name in node_block_list):
502
+ # Only need to block the output value_info changing
503
+ for output_name in node.output:
504
+ value_info_block_list.add(output_name)
505
+ else:
506
+ for attr in node.attribute:
507
+ # one tensor
508
+ if attr.t.data_type == onnx_proto.TensorProto.FLOAT:
509
+ attr.t.CopyFrom(
510
+ convert_tensor_float_to_float16(
511
+ attr.t, min_positive_val, max_finite_val
512
+ )
513
+ )
514
+ # list of tensor
515
+ for t in attr.tensors:
516
+ if t.data_type == onnx_proto.TensorProto.FLOAT:
517
+ t.CopyFrom(
518
+ convert_tensor_float_to_float16(
519
+ t, min_positive_val, max_finite_val
520
+ )
521
+ )
522
+ return value_info_block_list
523
+
524
+
525
+ # Change all the value info type from float32 to float16 if not in block list
526
+ def process_value_info(graph: onnx_proto.GraphProto, value_info_block_list: list):
527
+ for value_info in graph.value_info:
528
+ if value_info.name in value_info_block_list:
529
+ continue
530
+ else:
531
+ if value_info.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT:
532
+ value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
533
+
534
+
535
+ # Initializer is 'edge' type, so doesn't have value_info
536
+ def process_initializers(
537
+ graph: onnx_proto.GraphProto,
538
+ op_block_list,
539
+ node_block_list,
540
+ min_positive_val,
541
+ max_finite_val,
542
+ ):
543
+ # Find the input of the block node, don't need to change this kind of initializer
544
+ initializer_block_list = set()
545
+ for node in graph.node:
546
+ if (node.op_type in op_block_list) or (node.name in node_block_list):
547
+ for (
548
+ input_name
549
+ ) in (
550
+ node.input
551
+ ): # some is initializer, some is value_info, can't distinguish but doesn't matter
552
+ initializer_block_list.add(input_name)
553
+ # Process initializers
554
+ for initializer in graph.initializer:
555
+ if initializer.name not in initializer_block_list:
556
+ if initializer.data_type == onnx_proto.TensorProto.FLOAT:
557
+ convert_tensor_float_to_float16(
558
+ initializer, min_positive_val, max_finite_val
559
+ )
560
+
561
+
562
+ def get_next_level_graph(
563
+ graph: onnx_proto.GraphProto, op_block_list: list, node_block_list: list
564
+ ):
565
+ sub_graph_list = []
566
+ for node in graph.node:
567
+ if node.op_type in op_block_list or node.name in node_block_list:
568
+ continue
569
+ for attr in node.attribute:
570
+ # Check if sub-graph exist
571
+ if len(attr.g.node) > 0: # single sub-graph
572
+ sub_graph_list.append(attr.g)
573
+ for g in attr.graphs:
574
+ if len(g.node) > 0: # multiple sub-graphs
575
+ sub_graph_list.append(g)
576
+ return sub_graph_list
577
+
578
+
579
+ def add_cast_node(
580
+ graph: onnx_proto.GraphProto,
581
+ inputs: list,
582
+ outputs: list,
583
+ node_name: str,
584
+ to_type: int,
585
+ ):
586
+ new_node = [helper.make_node("Cast", inputs, outputs, to=to_type, name=node_name)]
587
+ graph.node.extend(new_node)
588
+
589
+
590
+ def add_new_value_info(
591
+ graph: onnx_proto.GraphProto,
592
+ exist_value_info: onnx_proto.ValueInfoProto,
593
+ name: str,
594
+ dtype: int,
595
+ ):
596
+ new_value_info = graph.value_info.add()
597
+ new_value_info.CopyFrom(exist_value_info)
598
+ new_value_info.name = name
599
+ new_value_info.type.tensor_type.elem_type = dtype
600
+
601
+
602
+ # Find the node that has the specified output name
603
+ def find_upstream_node_by_output_name(graph: onnx_proto.GraphProto, output_name: str):
604
+ nodes = []
605
+ for node in graph.node:
606
+ if output_name in node.output:
607
+ nodes.append(node)
608
+ assert len(nodes) <= 1  # Expect at most one producing node
609
+ return nodes
610
+
611
+
612
+ # Find the node that has the specified input name, including in subgraphs
613
+ def find_downstream_node_by_input_name(
614
+ graph: onnx_proto.GraphProto, input_name: str, include_subgraphs=True
615
+ ):
616
+ nodes = []
617
+
618
+ # Check nodes in current graph
619
+ for node in graph.node:
620
+ if input_name in node.input:
621
+ nodes.append(node)
622
+
623
+ if not include_subgraphs:
624
+ continue
625
+
626
+ # Recursively check subgraphs in node attributes
627
+ for attr in node.attribute:
628
+ if attr.type == onnx_proto.AttributeProto.GRAPH:
629
+ # Single subgraph
630
+ if len(attr.g.node) > 0:
631
+ nodes.extend(find_downstream_node_by_input_name(attr.g, input_name))
632
+
633
+ # Multiple subgraphs
634
+ if attr.type == onnx_proto.AttributeProto.GRAPHS:
635
+ for g in attr.graphs:
636
+ if len(g.node) > 0:
637
+ nodes.extend(find_downstream_node_by_input_name(g, input_name))
638
+
639
+ return nodes
640
+
641
+
642
+ # Remove identity node
643
+ def remove_identity_node_from_model(model: onnx_proto.ModelProto):
644
+ remove_identity_node_from_graph(model.graph)
645
+ try:
646
+ from onnx.shape_inference import infer_shapes
647
+
648
+ func_infer_shape = infer_shapes
649
+ model = func_infer_shape(model)
650
+ return model
651
+ finally:
652
+ pass
653
+
654
+
655
+ # Remove identity node
656
+ def remove_identity_node_from_graph(graph: onnx_proto.GraphProto):
657
+ for curr_node in graph.node:
658
+ if curr_node.op_type == "Identity":
659
+ for input_name in curr_node.input:
660
+ upstream_nodes = find_upstream_node_by_output_name(graph, input_name)
661
+ for u_node in upstream_nodes:
662
+ if u_node is not None:
663
+ u_node.output[0] = curr_node.output[0]
664
+ graph.node.remove(curr_node)
665
+
666
+
667
+ def convert_float_to_float16_model_path(
668
+ model_path, min_positive_val=1e-7, max_finite_val=1e4, keep_io_types=False
669
+ ):
670
+ """
671
+ Convert tensor float type in the ONNX Model to tensor float16.
672
+ *It is to fix an issue that infer_shapes func cannot be used to infer >2GB models.
673
+ *But this function can be applied to all model sizes.
674
+ :param model_path: ONNX Model path
675
+ :return: converted ONNX ModelProto object
676
+ Examples
677
+ ::
678
+ #Convert to ONNX ModelProto object and save model binary file:
679
+ from onnxmltools.utils.float16_converter import convert_float_to_float16_model_path
680
+ new_onnx_model = convert_float_to_float16_model_path('model.onnx')
681
+ onnx.save(new_onnx_model, 'new_model.onnx')
682
+ """
683
+
684
+ disable_shape_infer = False
685
+ if pv.Version(onnx.__version__) >= pv.Version("1.8"):
686
+ try:
687
+ # infer_shapes_path can be applied to all model sizes
688
+ from onnx.shape_inference import infer_shapes_path
689
+ import tempfile
690
+ import os
691
+
692
+ # shape_infer_model_path should be in the same folder of model_path
693
+ with tempfile.NamedTemporaryFile(
694
+ dir=os.path.dirname(model_path)
695
+ ) as tmpfile:
696
+ shape_infer_model_path = tmpfile.name
697
+ infer_shapes_path(model_path, shape_infer_model_path)
698
+ model = onnx.load(shape_infer_model_path)
699
+ disable_shape_infer = True
700
+ finally:
701
+ pass
702
+ if not disable_shape_infer:
703
+ model = onnx.load(model_path)
704
+ return convert_float_to_float16(
705
+ model, min_positive_val, max_finite_val, keep_io_types, disable_shape_infer
706
+ )
707
+
708
+
709
+ def remove_unnecessary_cast_node(graph_proto: onnx_proto.GraphProto):
710
+ # 1. find all cast nodes in the graph
711
+ cast_node_list = []
712
+ input_name_to_cast_node_dict = {}
713
+ output_name_to_cast_node_dict = {}
714
+ # using name as key to point to a node. because node object cannot be key
715
+ name_to_node_dict = {}
716
+ for node in graph_proto.node:
717
+ if node.op_type == "Cast":
718
+ # if node.name not in ["graph_input_cast0", "graph_output_cast0"]:
719
+ cast_node_list.append(node)
720
+
721
+ name_to_node_dict[node.name] = node
722
+ for input_name in node.input:
723
+ input_name_to_cast_node_dict[input_name] = node
724
+ for output_name in node.output:
725
+ output_name_to_cast_node_dict[output_name] = node
726
+
727
+ # 2. find upstream and downstream node of the cast node
728
+ cast_node_upstream_dict = {} # mapping cast node(name) to its upstream node
729
+ cast_node_downstream_dict = {} # mapping cast node(name) to its downstream node
730
+ for current_node in graph_proto.node:
731
+ # find the downstream node(s)
732
+ for input_name in current_node.input:
733
+ if input_name in output_name_to_cast_node_dict:
734
+ # found the downstream node of the cast node, might be multiple
735
+ cast_node = output_name_to_cast_node_dict[input_name]
736
+ if cast_node.name not in cast_node_downstream_dict:
737
+ cast_node_downstream_dict[cast_node.name] = current_node
738
+ else: # already exists one downstream node, make it a list
739
+ existing_downstream_nodes = cast_node_downstream_dict[
740
+ cast_node.name
741
+ ]
742
+ if isinstance(existing_downstream_nodes, list):
743
+ existing_downstream_nodes.append(current_node)
744
+ else: # make a list
745
+ existing_downstream_nodes = [
746
+ existing_downstream_nodes,
747
+ current_node,
748
+ ]
749
+ cast_node_downstream_dict[cast_node.name] = (
750
+ existing_downstream_nodes
751
+ )
752
+ # find the upstream node
753
+ for output_name in current_node.output:
754
+ if output_name in input_name_to_cast_node_dict:
755
+ # found the upstream node of the cast node, should be unique
756
+ cast_node = input_name_to_cast_node_dict[output_name]
757
+ cast_node_upstream_dict[cast_node.name] = current_node
758
+
759
+ # 3. remove the cast node which upstream is 'Constant'
760
+ for cast_node_name, upstream_node in cast_node_upstream_dict.items():
761
+ cast_node = name_to_node_dict[cast_node_name]
762
+ if upstream_node.op_type == "Constant":
763
+ cast_node_list.remove(cast_node)
764
+
765
+ # 4. find (cast_to_fp16, cast_to_fp32) pairs where --fp32--> cast_to_fp16 --fp16--> cast_to_fp32.
766
+ remove_candidate = []
767
+
768
+ name_to_value_info = {
769
+ value_info.name: value_info
770
+ for value_info in itertools.chain(graph_proto.value_info, graph_proto.input)
771
+ }
772
+
773
+ def get_type(name: str) -> Optional[int]:
774
+ if name in name_to_value_info:
775
+ return name_to_value_info[name].type
776
+ else:
777
+ # `name` has no value info.
778
+ return None
779
+
780
+ for cast_node_name, downstream_node in cast_node_downstream_dict.items():
781
+ cast_node = name_to_node_dict[cast_node_name]
782
+ if len(cast_node.input) != 1:
783
+ raise RuntimeError(
784
+ f"Cast node {cast_node_name} should have only one input, but has {len(cast_node.input)}."
785
+ )
786
+
787
+ input_type = get_type(cast_node.input[0])
788
+ if input_type != onnx_proto.TensorProto.FLOAT:
789
+ continue
790
+ if isinstance(downstream_node, list):
791
+ for dn in downstream_node:
792
+ if (
793
+ dn.op_type == "Cast"
794
+ and dn.attribute[0].i == 32
795
+ and cast_node.attribute[0].i == 16
796
+ and dn in cast_node_list
797
+ and cast_node in cast_node_list
798
+ ):
799
+ remove_candidate.append((cast_node, dn))
800
+ else:
801
+ if (
802
+ downstream_node.op_type == "Cast"
803
+ and cast_node.attribute[0].i == FLOAT16
804
+ and downstream_node.attribute[0].i == FLOAT32
805
+ and downstream_node in cast_node_list
806
+ and cast_node in cast_node_list
807
+ ):
808
+ remove_candidate.append((cast_node, downstream_node))
809
+
810
+ # 5. change "upstream --fp32--> cast_to_fp16 --fp16--> cast_to_fp32 --fp32--> downstream" to
811
+ # "upstream --fp32--> downstream".
812
+ for cast_node_pair in remove_candidate:
813
+ first_cast_node = cast_node_pair[0]
814
+ second_cast_node = cast_node_pair[1]
815
+ upstream_node = cast_node_upstream_dict.get(first_cast_node.name)
816
+ downstream_node = cast_node_downstream_dict.get(second_cast_node.name)
817
+ if upstream_node is None and downstream_node is not None:
818
+ # The upstream_node should be graph input
819
+ out = first_cast_node.input[0]
820
+ for i, input_name in enumerate(downstream_node.input):
821
+ for output_name in second_cast_node.output:
822
+ if input_name == output_name:
823
+ # change the input as the upstream node's output
824
+ downstream_node.input[i] = out
825
+ elif upstream_node is not None and downstream_node is None:
826
+ raise ValueError(
827
+ "The downstream node of the second cast node should be graph output"
828
+ )
829
+ else:
830
+ # find the upstream node's output to first_cast_node
831
+ out = None
832
+ for output_name in upstream_node.output:
833
+ if output_name == first_cast_node.input[0]:
834
+ out = output_name
835
+ break
836
+ # find the downstream node's input as second_cast_node's output
837
+ for i, input_name in enumerate(downstream_node.input):
838
+ for output_name in second_cast_node.output:
839
+ if input_name == output_name:
840
+ # change the input as the upstream node's output
841
+ downstream_node.input[i] = out
842
+
843
+ # 6. remove the cast node pair
844
+ for cast_node_pair in remove_candidate:
845
+ graph_proto.node.remove(cast_node_pair[0])
846
+ graph_proto.node.remove(cast_node_pair[1])
847
+
848
+
849
+ # Check if the model is already converted to float16
850
+ def check_if_fp16_ready(graph_proto):
851
+ # Check graph input and output
852
+ is_value_info_fp16 = False
853
+ for value_info in itertools.chain(
854
+ graph_proto.output, graph_proto.input, graph_proto.value_info
855
+ ):
856
+ if value_info.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT16:
857
+ is_value_info_fp16 = True
858
+ break
859
+
860
+ # Check initializer
861
+ is_initializer_fp16 = False
862
+ for initializer in graph_proto.initializer:
863
+ if initializer.data_type == onnx_proto.TensorProto.FLOAT16:
864
+ is_initializer_fp16 = True
865
+ break
866
+
867
+ # Check cast node
868
+ has_cast_node_fp16 = False
869
+ for node in graph_proto.node:
870
+ if node.op_type == "Cast" and node.attribute[0].i == FLOAT16:
871
+ has_cast_node_fp16 = True
872
+ break
873
+
874
+ # Any of above flags is True, return True
875
+ if is_value_info_fp16 or is_initializer_fp16 or has_cast_node_fp16:
876
+ return True # already converted to float16
877
+ else:
878
+ return False # not converted to float16 yet
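
`float16.py` above provides the fp16 converter that `quantize_extended.py` (added below) drives for its `fp16` mode. A minimal usage sketch, assuming an fp32 input model that fits comfortably in memory; the output filename is illustrative:

```python
import onnx

import float16

fp32_model = onnx.load("embeddinggemma-300m/model.onnx")
fp16_model = float16.convert_float_to_float16(fp32_model, keep_io_types=True)  # keep fp32 graph inputs/outputs
onnx.save(fp16_model, "embeddinggemma-300m/onnx/model_fp16.onnx")  # illustrative output path
```

For models too large to shape-infer in memory, the file also exposes `convert_float_to_float16_model_path`, which runs shape inference against the on-disk path first.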
quantize_extended.py ADDED
@@ -0,0 +1,189 @@
1
+ """
2
+ Script to quantize ONNX models to additional formats: int4, int8, etc.
3
+ Based on transformers.js/scripts/quantize.py, extended for more quantization options.
4
+ """
5
+
6
+ from enum import Enum
7
+ from tqdm import tqdm
8
+ from typing import Set, List, Optional
9
+ import onnx
10
+ import os
11
+ from dataclasses import dataclass, field
12
+ from transformers import HfArgumentParser
13
+ from onnxruntime.quantization import QuantType, QuantizationMode
14
+ from onnxruntime.quantization.onnx_quantizer import ONNXQuantizer
15
+ from onnxruntime.quantization.registry import IntegerOpsRegistry
16
+ from onnxruntime.quantization.matmul_nbits_quantizer import MatMulNBitsQuantizer
17
+ from onnxruntime.quantization.matmul_bnb4_quantizer import MatMulBnb4Quantizer
18
+ import float16
19
+ import utils
20
+
21
+ class QuantMode(Enum):
22
+ FP16 = "fp16"
23
+ Q8 = "q8"
24
+ QI8 = "int8"
25
+ QU8 = "uint8"
26
+ Q4 = "q4"
27
+ Q4F16 = "q4f16"
28
+ BNB4 = "bnb4"
29
+ INT4 = "int4"
30
+ INT8 = "int8"
31
+
32
+ QUANTIZE_SUFFIX_MAPPING = {
33
+ QuantMode.Q8: "quantized",
34
+ QuantMode.INT4: "int4",
35
+ QuantMode.INT8: "int8",
36
+ }
37
+
38
+ QUANTIZE_OPTIONS = tuple(x.value for x in QuantMode)
39
+ QUINT8_OPS = (
40
+ "Conv",
41
+ "GroupQueryAttention",
42
+ "MultiHeadAttention",
43
+ )
44
+
45
+ @dataclass
46
+ class IOArguments:
47
+ input_folder: str = field(metadata={"help": "Path of the input folder containing the .onnx models to quantize"})
48
+ output_folder: str = field(metadata={"help": "Path of the output folder where the quantized .onnx models will be saved"})
49
+
50
+ @dataclass
51
+ class QuantizationArguments:
52
+ modes: QuantMode = field(default=QUANTIZE_OPTIONS, metadata={"help": "Quantization mode to use.", "choices": QUANTIZE_OPTIONS, "nargs": "+",})
53
+ per_channel: bool = field(default=None, metadata={"help": "Whether to quantize weights per channel"})
54
+ reduce_range: bool = field(default=None, metadata={"help": "Whether to quantize weights with 7-bits."})
55
+ block_size: int = field(default=None, metadata={"help": "Block size for blockwise quantization."})
56
+ is_symmetric: bool = field(default=True, metadata={"help": "Indicate whether to quantize the model symmetrically"})
57
+ accuracy_level: int = field(default=None, metadata={"help": "Accuracy level of the 4-bit quantized MatMul computation."})
58
+ quant_type: int = field(default=MatMulBnb4Quantizer.NF4, metadata={"help": "Quantization data type. 0: FP4, 1: NF4", "choices": [MatMulBnb4Quantizer.FP4, MatMulBnb4Quantizer.NF4],})
59
+ op_block_list: List[str] = field(default=None, metadata={"help": "List of operators to exclude from quantization.", "nargs": "+",})
60
+
61
+ def quantize_int4(
62
+ model: onnx.ModelProto,
63
+ save_path: str,
64
+ block_size: int = 32,
65
+ is_symmetric: bool = True,
66
+ accuracy_level: int = 4,
67
+ ):
68
+ """
69
+ Quantize the weights of the model from float32 to 4-bit int using MatMulNBitsQuantizer
70
+ """
71
+ quantizer = MatMulNBitsQuantizer(
72
+ model=model,
73
+ block_size=block_size,
74
+ is_symmetric=is_symmetric,
75
+ accuracy_level=accuracy_level,
76
+ )
77
+ quantizer.process()
78
+ utils.check_and_save_model(quantizer.model.model, save_path)
79
+ return quantizer.model.model
80
+
81
+ def quantize_int8(
82
+ model: onnx.ModelProto,
83
+ save_path: str,
84
+ per_channel: bool = False,
85
+ reduce_range: bool = False,
86
+ weight_type: QuantType = QuantType.QInt8,
87
+ op_block_list: Optional[List[str]] = None,
88
+ ):
89
+ """
90
+ Quantize the weights of the model from float32 to int8
91
+ """
92
+ op_types_to_quantize = set(IntegerOpsRegistry.keys())
93
+ if op_block_list is not None:
94
+ op_types_to_quantize.difference_update(op_block_list)
95
+
96
+ quantizer = ONNXQuantizer(
97
+ model,
98
+ per_channel,
99
+ reduce_range,
100
+ mode=QuantizationMode.IntegerOps,
101
+ static=False,
102
+ weight_qType=weight_type,
103
+ activation_qType=QuantType.QUInt8,
104
+ tensors_range=None,
105
+ nodes_to_quantize=[],
106
+ nodes_to_exclude=[],
107
+ op_types_to_quantize=op_types_to_quantize,
108
+ extra_options=dict(EnableSubgraph=True, MatMulConstBOnly=True),
109
+ )
110
+ quantizer.quantize_model()
111
+ utils.check_and_save_model(quantizer.model.model, save_path)
112
+ return quantizer.model.model
113
+
114
+ def main():
115
+ parser = HfArgumentParser((IOArguments, QuantizationArguments))
116
+ io_args, quantization_args = parser.parse_args_into_dataclasses()
117
+ input_folder = io_args.input_folder
118
+ output_folder = io_args.output_folder
119
+ if not quantization_args.modes:
120
+ raise ValueError("At least one quantization mode must be specified")
121
+
122
+ if not os.path.exists(input_folder):
123
+ raise ValueError(f"Input folder {input_folder} does not exist")
124
+
125
+ model_names_or_paths = [
126
+ os.path.join(input_folder, file)
127
+ for file in os.listdir(input_folder)
128
+ if file.endswith(".onnx")
129
+ ]
130
+ if not model_names_or_paths:
131
+ raise ValueError(f"No .onnx models found in {input_folder}")
132
+
133
+ os.makedirs(output_folder, exist_ok=True)
134
+
135
+ for model_path in tqdm(model_names_or_paths, desc="Models"):
136
+ file_name_without_extension = os.path.splitext(os.path.basename(model_path))[0]
137
+ model = onnx.load_model(model_path)
138
+ for mode in tqdm(quantization_args.modes, desc="Modes"):
139
+ try:
140
+ suffix = QUANTIZE_SUFFIX_MAPPING.get(QuantMode(mode), mode)
141
+ except Exception:
142
+ suffix = mode
143
+ save_path = os.path.join(output_folder, f"{file_name_without_extension}_{suffix}.onnx")
144
+ mode_enum = QuantMode(mode)
145
+ try:
146
+ if mode_enum == QuantMode.FP16:
147
+ model_fp16 = float16.convert_float_to_float16(
+     model,
+     keep_io_types=True,
+     disable_shape_infer=False,
+     op_block_list=quantization_args.op_block_list or [],
+ )
+ utils.check_and_save_model(model_fp16, save_path)
153
+
154
+ elif mode_enum == QuantMode.INT4 or mode_enum == QuantMode.Q4:
155
+ quantize_int4(
156
+ model,
157
+ save_path,
158
+ block_size=quantization_args.block_size or 32,
159
+ is_symmetric=quantization_args.is_symmetric,
160
+ accuracy_level=quantization_args.accuracy_level or 0,
161
+ )
162
+
163
+ elif mode_enum == QuantMode.INT8 or mode_enum == QuantMode.QI8:
164
+ quantize_int8(
165
+ model,
166
+ save_path,
167
+ per_channel=quantization_args.per_channel or False,
168
+ reduce_range=quantization_args.reduce_range or False,
169
+ weight_type=QuantType.QInt8,
170
+ op_block_list=quantization_args.op_block_list,
171
+ )
172
+
173
+ elif mode_enum == QuantMode.Q8:
174
+ quantize_int8(
175
+ model,
176
+ save_path,
177
+ per_channel=quantization_args.per_channel or False,
178
+ reduce_range=quantization_args.reduce_range or False,
179
+ weight_type=QuantType.QUInt8,
180
+ op_block_list=quantization_args.op_block_list,
181
+ )
182
+
183
+ # Add other modes as needed (Q4F16, BNB4, QU8, etc.)
184
+ except Exception as e:
185
+ print(f"[WARN] Quantization mode '{mode}' failed for model '{model_path}': {e}")
186
+ continue
187
+
188
+ if __name__ == "__main__":
189
+ main()
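
`quantize_extended.py` is normally invoked from the command line via `HfArgumentParser` (flags such as `--input_folder`, `--output_folder`, and `--modes int4 int8` follow the dataclass fields above), but its helpers can also be called directly. A hypothetical programmatic sketch producing the int4/int8 artifacts added in this commit; the paths are illustrative:

```python
import onnx

from quantize_extended import quantize_int4, quantize_int8

src = "embeddinggemma-300m/onnx/model.onnx"  # illustrative fp32 source model

# 4-bit weight-only quantization via MatMulNBitsQuantizer.
quantize_int4(onnx.load_model(src), "embeddinggemma-300m/onnx/model_int4.onnx",
              block_size=32, is_symmetric=True, accuracy_level=4)

# 8-bit dynamic (integer-ops) weight quantization.
quantize_int8(onnx.load_model(src), "embeddinggemma-300m/onnx/model_int8.onnx",
              per_channel=False, reduce_range=False)
```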
utils.py ADDED
@@ -0,0 +1,67 @@
1
+ import onnx
2
+ from typing import Optional, Union
3
+ from pathlib import Path
4
+ import os
5
+ import logging
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
+ # https://github.com/onnx/onnx/pull/6556
11
+ MAXIMUM_PROTOBUF = 2147483648 # 2GiB
12
+
13
+
14
+ def strict_check_model(model_or_path: Union[onnx.ModelProto, str, Path]):
15
+ try:
16
+ onnx.checker.check_model(model_or_path, full_check=True)
17
+ except Exception as e:
18
+ if "No Op registered for" in str(e):
19
+ pass
20
+ else:
21
+ raise e
22
+
23
+
24
+ def check_and_save_model(model: onnx.ModelProto, save_path: Optional[Union[str, Path]]):
25
+ if model.ByteSize() < MAXIMUM_PROTOBUF:
26
+ strict_check_model(model)
27
+ if save_path:
28
+ save_path = Path(save_path).as_posix()
29
+ external_file_name = os.path.basename(save_path) + "_data"
30
+ external_path = os.path.join(os.path.dirname(save_path), external_file_name)
31
+
32
+ if save_path.endswith(".onnx") and os.path.isfile(save_path):
33
+ os.remove(save_path)
34
+ if os.path.isfile(external_path):
35
+ os.remove(external_path)
36
+
37
+ onnx.save(
38
+ model,
39
+ save_path,
40
+ convert_attribute=True,
41
+ )
42
+ elif save_path is not None:
43
+ # path/to/model.onnx
44
+ save_path = Path(save_path).as_posix()
45
+
46
+ external_file_name = os.path.basename(save_path) + "_data"
47
+ # path/to/model.onnx_data
48
+ external_path = os.path.join(os.path.dirname(save_path), external_file_name)
49
+
50
+ if save_path.endswith(".onnx") and os.path.isfile(save_path):
51
+ os.remove(save_path)
52
+ if os.path.isfile(external_path):
53
+ os.remove(external_path)
54
+
55
+ onnx.save(
56
+ model,
57
+ save_path,
58
+ save_as_external_data=True,
59
+ all_tensors_to_one_file=True,
60
+ location=external_file_name,
61
+ convert_attribute=True,
62
+ )
63
+
64
+ else:
65
+ logger.info(
66
+ "Merged ONNX model exceeds 2GB, the model will not be checked without `save_path` given."
67
+ )
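
`check_and_save_model` runs the strict ONNX checker for protos under 2GB and, for larger models, saves the weights to a sibling `<name>.onnx_data` file instead of checking in memory. A small sketch of calling it directly; the paths are illustrative:

```python
import onnx

from utils import check_and_save_model

model = onnx.load_model("embeddinggemma-300m/onnx/model.onnx")  # illustrative input
check_and_save_model(model, "embeddinggemma-300m/onnx/model_checked.onnx")
```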