|
import itertools
import warnings
from typing import Optional

import numpy as np
import onnx
import packaging.version as pv
from onnx import helper, numpy_helper
from onnx import onnx_pb as onnx_proto

import onnxslim.third_party.onnx_graphsurgeon as gs
|
|
|
|
|
# Values of TensorProto's DataType enum, as used by the Cast node's "to"
# attribute (onnx_proto.TensorProto.FLOAT == 1, onnx_proto.TensorProto.FLOAT16 == 10).
FLOAT32 = 1
FLOAT16 = 10
|
|
|
|
|
def _npfloat16_to_int(np_list):
    """
    Reinterpret numpy float16 values as Python ints holding the raw 16-bit
    patterns (the format expected by TensorProto.int32_data).

    :param np_list: iterable of numpy float16 values
    :return: list of Python ints
    """
    return [int(x.view("H")) for x in np_list]
|
|
|
|
|
def convert_np_to_float16(np_array, min_positive_val=1e-7, max_finite_val=1e4): |
|
""" |
|
Convert float32 numpy array to float16 without changing sign or finiteness. |
|
Positive values less than min_positive_val are mapped to min_positive_val. |
|
Positive finite values greater than max_finite_val are mapped to max_finite_val. |
|
Similar for negative values. NaN, 0, inf, and -inf are unchanged. |
|
""" |
|
|
|
def between(a, b, c): |
|
return np.logical_and(a < b, b < c) |
|
|
|
positive_values = np_array[np.where(np_array > 0)] |
|
if positive_values.shape[0] > 0: |
|
pos_max = positive_values.max() |
|
pos_min = positive_values.min() |
|
|
|
if pos_max >= max_finite_val: |
|
warnings.warn( |
|
"the float32 number {} will be truncated to {}".format( |
|
pos_max, max_finite_val |
|
) |
|
) |
|
|
|
if pos_min <= min_positive_val: |
|
warnings.warn( |
|
"the float32 number {} will be truncated to {}".format( |
|
pos_min, min_positive_val |
|
) |
|
) |
|
|
|
negative_values = np_array[np.where(np_array < 0)] |
|
if negative_values.shape[0] > 0: |
|
neg_max = negative_values.max() |
|
neg_min = negative_values.min() |
|
|
|
if neg_min <= -max_finite_val: |
|
warnings.warn( |
|
"the float32 number {} will be truncated to {}".format( |
|
neg_min, -max_finite_val |
|
) |
|
) |
|
|
|
if neg_max >= -min_positive_val: |
|
warnings.warn( |
|
"the float32 number {} will be truncated to {}".format( |
|
neg_max, -min_positive_val |
|
) |
|
) |
|
|
|
np_array = np.where( |
|
between(0, np_array, min_positive_val), min_positive_val, np_array |
|
) |
|
np_array = np.where( |
|
between(-min_positive_val, np_array, 0), -min_positive_val, np_array |
|
) |
|
np_array = np.where( |
|
between(max_finite_val, np_array, float("inf")), max_finite_val, np_array |
|
) |
|
np_array = np.where( |
|
between(float("-inf"), np_array, -max_finite_val), -max_finite_val, np_array |
|
) |
|
return np.float16(np_array) |
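
# Illustrative sanity check of the clamping behavior (values chosen as examples):
#
#   >>> convert_np_to_float16(np.array([1e-9, 5.0, 1e9], dtype=np.float32))
#   # -> roughly [1e-07, 5.0, 1e4] as float16, with truncation warnings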
|
|
|
|
|
def convert_tensor_float_to_float16(tensor, min_positive_val=1e-7, max_finite_val=1e4): |
|
""" |
|
Convert tensor float to float16. |
|
|
|
:param tensor: TensorProto object |
|
:return tensor_float16: converted TensorProto object |
|
""" |
|
if not isinstance(tensor, onnx_proto.TensorProto): |
|
raise ValueError( |
|
"Expected input type is an ONNX TensorProto but got %s" % type(tensor) |
|
) |
|
|
|
if tensor.data_type == onnx_proto.TensorProto.FLOAT: |
|
tensor.data_type = onnx_proto.TensorProto.FLOAT16 |
|
|
|
if tensor.float_data: |
|
float16_data = convert_np_to_float16( |
|
np.array(tensor.float_data), min_positive_val, max_finite_val |
|
) |
|
int_list = _npfloat16_to_int(float16_data) |
|
tensor.int32_data[:] = int_list |
|
tensor.float_data[:] = [] |
|
|
|
        if tensor.raw_data:
            # np.fromstring and ndarray.tostring are deprecated and removed in
            # NumPy 2.0; use the buffer-based equivalents.
            float32_list = np.frombuffer(tensor.raw_data, dtype="float32")
            float16_list = convert_np_to_float16(
                float32_list, min_positive_val, max_finite_val
            )
            tensor.raw_data = float16_list.tobytes()
    return tensor
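
# Quick illustrative usage (hypothetical tensor name "w"):
#
#   t = numpy_helper.from_array(np.array([1.0, 2.0], dtype=np.float32), "w")
#   t = convert_tensor_float_to_float16(t)
#   assert t.data_type == onnx_proto.TensorProto.FLOAT16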
|
|
|
|
|
def make_value_info_from_tensor(tensor): |
|
shape = numpy_helper.to_array(tensor).shape |
|
return helper.make_tensor_value_info(tensor.name, tensor.data_type, shape) |
|
|
|
|
|
# Ops kept in float32 by default, either because they have no float16
# implementation or because converting them is numerically unsafe.
DEFAULT_OP_BLOCK_LIST = [
    "ArrayFeatureExtractor",
    "Binarizer",
    "CastMap",
    "CategoryMapper",
    "DictVectorizer",
    "FeatureVectorizer",
    "Imputer",
    "LabelEncoder",
    "LinearClassifier",
    "LinearRegressor",
    "Normalizer",
    "OneHotEncoder",
    "RandomUniformLike",
    "SVMClassifier",
    "SVMRegressor",
    "Scaler",
    "TreeEnsembleClassifier",
    "TreeEnsembleRegressor",
    "ZipMap",
    "NonMaxSuppression",
    "TopK",
    "RoiAlign",
    "Resize",
    "CumSum",
    "Min",
    "Max",
    "Upsample",
    "RandomNormalLike",
    # Cast nodes are handled explicitly by the converter itself.
    "Cast",
]
|
|
|
|
|
def initial_checking(model, disable_shape_infer):
    func_infer_shape = None
    if not disable_shape_infer and pv.Version(onnx.__version__) >= pv.Version("1.2"):
        try:
            from onnx.shape_inference import infer_shapes

            func_infer_shape = infer_shapes
        except ImportError:
            # Shape inference is optional; proceed without it if unavailable.
            pass
|
|
|
if not isinstance(model, onnx_proto.ModelProto): |
|
raise ValueError( |
|
"Expected model type is an ONNX ModelProto but got %s" % type(model) |
|
) |
|
|
|
if func_infer_shape is not None: |
|
model = func_infer_shape(model) |
|
|
|
is_fp16_ready_flag = check_if_fp16_ready(model.graph) |
|
|
|
return model, func_infer_shape, is_fp16_ready_flag |
|
|
|
|
|
def convert_float_to_float16( |
|
model, |
|
min_positive_val=1e-7, |
|
max_finite_val=1e4, |
|
keep_io_types=False, |
|
disable_shape_infer=False, |
|
op_block_list=None, |
|
node_block_list=None, |
|
check_fp16_ready=True, |
|
): |
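    """
    Convert all float32 tensors, value_infos, and initializers in the model to
    float16, inserting Cast nodes around blocked nodes.

    :param model: ONNX ModelProto object
    :param min_positive_val: clip positive values below this threshold
    :param max_finite_val: clip finite values above this threshold
    :param keep_io_types: keep top-level graph inputs/outputs in float32 and
        insert boundary Cast nodes
    :param disable_shape_infer: skip the shape-inference pass
    :param op_block_list: op types to keep in float32 (defaults to
        DEFAULT_OP_BLOCK_LIST)
    :param node_block_list: node names to keep in float32
    :param check_fp16_ready: raise if the model already looks converted
    :return: converted ONNX ModelProto object
    """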
|
|
|
|
|
if op_block_list is None: |
|
op_block_list = DEFAULT_OP_BLOCK_LIST |
|
if node_block_list is None: |
|
node_block_list = [] |
|
op_block_list = set(op_block_list) |
|
node_block_list = set(node_block_list) |
|
|
|
    # Maps an original fp32 tensor name to the output name of the Cast node
    # that produces its fp16 counterpart.
    global_input_name_dict = {}
|
|
|
model, func_infer_shape, is_fp16_ready_flag = initial_checking( |
|
model, disable_shape_infer |
|
) |
|
    if is_fp16_ready_flag and check_fp16_ready:
        raise ValueError(
            "The model appears to already be in float16; converting it again "
            "may corrupt it. If you are sure you want to convert anyway, set "
            "check_fp16_ready=False."
        )
|
|
|
graph_stack = [model.graph] |
|
|
|
is_top_level = True |
|
while graph_stack: |
|
next_level = [] |
|
for curr_graph in graph_stack: |
|
process_graph_input( |
|
curr_graph, is_top_level, keep_io_types, global_input_name_dict |
|
) |
|
value_info_block_list = process_tensor_in_node( |
|
curr_graph, |
|
op_block_list, |
|
node_block_list, |
|
min_positive_val, |
|
max_finite_val, |
|
) |
|
process_value_info(curr_graph, value_info_block_list) |
|
process_node_in_block_list( |
|
curr_graph, global_input_name_dict, op_block_list, node_block_list |
|
) |
|
process_initializers( |
|
curr_graph, |
|
op_block_list, |
|
node_block_list, |
|
min_positive_val, |
|
max_finite_val, |
|
) |
|
process_graph_output(curr_graph, is_top_level, keep_io_types) |
|
sub_graph_list = get_next_level_graph( |
|
curr_graph, op_block_list, node_block_list |
|
) |
|
if len(sub_graph_list) > 0: |
|
next_level.extend(sub_graph_list) |
|
|
|
if not is_top_level: |
|
process_node_input_output(curr_graph, global_input_name_dict) |
|
is_top_level = False |
|
graph_stack = next_level |
|
|
|
remove_unnecessary_cast_node(model.graph) |
|
|
|
|
|
|
|
    # The inserted Cast nodes were appended to the end of graph.node, so
    # re-sort the graph topologically before returning it.
    graph = gs.import_onnx(model)
    graph.toposort()
    model = gs.export_onnx(graph)
|
|
|
return model |
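
# Minimal usage sketch (file names are illustrative):
#
#   import onnx
#   model = onnx.load("model.onnx")
#   model_fp16 = convert_float_to_float16(model, keep_io_types=True)
#   onnx.save(model_fp16, "model_fp16.onnx")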
|
|
|
|
|
|
|
|
|
def process_node_input_output( |
|
graph: onnx_proto.GraphProto, global_input_name_dict: dict |
|
): |
|
for node in graph.node: |
|
for i, input_name in enumerate(node.input): |
|
if input_name in global_input_name_dict: |
|
node.input[i] = global_input_name_dict[input_name] |
|
for i, output_name in enumerate(node.output): |
|
if output_name in global_input_name_dict: |
|
node.output[i] = global_input_name_dict[output_name] |
|
|
|
|
|
def process_graph_input( |
|
graph: onnx_proto.GraphProto, |
|
is_top_level: bool, |
|
is_io_fp32: bool, |
|
global_input_name_dict: dict, |
|
): |
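    """
    For the top-level graph with keep_io_types, keep graph inputs in float32
    and insert fp32 -> fp16 Cast nodes after them; otherwise flip the input
    types to float16 directly.
    """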
|
|
|
    if is_top_level and is_io_fp32:
        for graph_input in graph.input:
            if graph_input.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT:
                downstream_nodes = find_downstream_node_by_input_name(
                    graph, graph_input.name
                )
                for d_node in downstream_nodes:
                    # Create the fp32 -> fp16 Cast for this input only once;
                    # later consumers reuse its output.
                    if graph_input.name in global_input_name_dict:
                        cast_node_output_name = global_input_name_dict[
                            graph_input.name
                        ]
                    else:
                        cast_node_output_name = graph_input.name + "_fp16"
                        add_cast_node(
                            graph,
                            [graph_input.name],
                            [cast_node_output_name],
                            cast_node_output_name,
                            FLOAT16,
                        )
                        add_new_value_info(
                            graph,
                            graph_input,
                            cast_node_output_name,
                            onnx_proto.TensorProto.FLOAT16,
                        )
                        global_input_name_dict[graph_input.name] = (
                            cast_node_output_name
                        )
                    # Rewire this consumer to read from the Cast output.
                    for i, input_name in enumerate(d_node.input):
                        if input_name == graph_input.name:
                            d_node.input[i] = cast_node_output_name
    else:
        for graph_input in graph.input:
            if graph_input.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT:
                graph_input.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
|
|
|
|
|
def process_graph_output( |
|
graph: onnx_proto.GraphProto, is_top_level: bool, is_io_fp32: bool |
|
): |
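    """
    For the top-level graph with keep_io_types, rename each float32 producer
    output and insert an fp16 -> fp32 Cast back to the original output name;
    otherwise flip the output types to float16 directly.
    """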
|
if is_top_level and is_io_fp32: |
|
for i, graph_output in enumerate(graph.output): |
|
if graph_output.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT: |
|
new_producer_name = graph_output.name + "_fp16" |
|
original_name = graph_output.name |
|
|
|
|
|
|
|
                upstream_nodes = find_upstream_node_by_output_name(graph, original_name)
                assert (
                    len(upstream_nodes) == 1
                ), "graph output %s must have exactly one producer" % original_name

                producer_node = upstream_nodes[0]
                # Use j here so the graph-output index i (needed for the Cast
                # node name below) is not shadowed.
                for j, output_name in enumerate(producer_node.output):
                    if output_name == original_name:
                        producer_node.output[j] = new_producer_name

                cast_node_name = new_producer_name + "_input_cast" + str(i)
|
add_cast_node( |
|
graph, |
|
[new_producer_name], |
|
[original_name], |
|
cast_node_name, |
|
onnx_proto.TensorProto.FLOAT, |
|
) |
|
for value_info in graph.value_info: |
|
if original_name == value_info.name: |
|
value_info.type.tensor_type.elem_type = ( |
|
onnx_proto.TensorProto.FLOAT |
|
) |
|
|
|
|
|
downstream_nodes = find_downstream_node_by_input_name( |
|
graph, |
|
original_name, |
|
include_subgraphs=False, |
|
) |
|
|
|
|
|
|
|
for d_node in downstream_nodes: |
|
for i, input_name in enumerate(d_node.input): |
|
if input_name == original_name: |
|
d_node.input[i] = new_producer_name |
|
|
|
else: |
|
for graph_output in graph.output: |
|
if graph_output.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT: |
|
graph_output.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16 |
|
|
|
|
|
def process_node_in_block_list( |
|
graph: onnx_proto.GraphProto, |
|
global_input_name_dict: dict, |
|
op_block_list: list, |
|
node_block_list: list, |
|
): |
|
|
|
|
|
for node in list(graph.node): |
|
if (node.op_type in op_block_list) or (node.name in node_block_list): |
|
insert_cast32_before_node(graph, node, global_input_name_dict) |
|
insert_cast16_after_node(graph, node, global_input_name_dict) |
|
|
|
|
|
|
|
def insert_cast32_before_node( |
|
graph: onnx_proto.GraphProto, node: onnx_proto.NodeProto, global_input_name_dict |
|
): |
|
for i, input_name in enumerate(node.input): |
|
for value_info in itertools.chain(graph.value_info, graph.input): |
|
if input_name == value_info.name: |
|
if ( |
|
value_info.type.tensor_type.elem_type |
|
!= onnx_proto.TensorProto.FLOAT16 |
|
): |
|
break |
|
cast_output_name = node.name + "_input_cast_" + str(i) |
|
add_new_value_info( |
|
graph, value_info, cast_output_name, onnx_proto.TensorProto.FLOAT |
|
) |
|
cast_node_name = node.name + "_input_cast" + str(i) |
|
add_cast_node( |
|
graph, |
|
[input_name], |
|
[cast_output_name], |
|
cast_node_name, |
|
onnx_proto.TensorProto.FLOAT, |
|
) |
|
node.input[i] = cast_output_name |
|
break |
|
|
|
|
|
|
|
def insert_cast16_after_node( |
|
graph: onnx_proto.GraphProto, node: onnx_proto.NodeProto, global_input_name_dict |
|
): |
|
for i, output_name in enumerate(node.output): |
|
for value_info in itertools.chain(graph.value_info, graph.output): |
|
if output_name == value_info.name: |
|
if ( |
|
value_info.type.tensor_type.elem_type |
|
!= onnx_proto.TensorProto.FLOAT |
|
): |
|
break |
|
cast_input_name = node.name + "_output_cast_" + str(i) |
|
add_new_value_info( |
|
graph, value_info, cast_input_name, onnx_proto.TensorProto.FLOAT |
|
) |
|
value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16 |
|
cast_node_name = node.name + "_output_cast" + str(i) |
|
add_cast_node( |
|
graph, |
|
[cast_input_name], |
|
[output_name], |
|
cast_node_name, |
|
onnx_proto.TensorProto.FLOAT16, |
|
) |
|
node.output[i] = cast_input_name |
|
break |
|
|
|
|
|
|
|
def process_tensor_in_node( |
|
graph: onnx_proto.GraphProto, |
|
op_block_list: list, |
|
node_block_list: list, |
|
min_positive_val, |
|
max_finite_val, |
|
): |
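    """
    Convert float32 tensors stored in node attributes to float16, skipping
    blocked nodes and Cast nodes; the outputs of skipped nodes are returned so
    their value_info entries stay float32.
    """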
|
value_info_block_list = set() |
|
for node in graph.node: |
|
|
|
if ( |
|
(node.op_type in op_block_list) |
|
or (node.name in node_block_list) |
|
or (node.op_type == "Cast") |
|
): |
|
|
|
|
|
for output_name in node.output: |
|
value_info_block_list.add(output_name) |
|
else: |
|
for attr in node.attribute: |
|
|
|
if attr.t.data_type == onnx_proto.TensorProto.FLOAT: |
|
attr.t.CopyFrom( |
|
convert_tensor_float_to_float16( |
|
attr.t, min_positive_val, max_finite_val |
|
) |
|
) |
|
|
|
for t in attr.tensors: |
|
if t.data_type == onnx_proto.TensorProto.FLOAT: |
|
t.CopyFrom( |
|
convert_tensor_float_to_float16( |
|
t, min_positive_val, max_finite_val |
|
) |
|
) |
|
return value_info_block_list |
|
|
|
|
|
|
|
def process_value_info(graph: onnx_proto.GraphProto, value_info_block_list: list):
    for value_info in graph.value_info:
        if value_info.name in value_info_block_list:
            continue
        if value_info.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT:
            value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16
|
|
|
|
|
|
|
def process_initializers( |
|
graph: onnx_proto.GraphProto, |
|
op_block_list, |
|
node_block_list, |
|
min_positive_val, |
|
max_finite_val, |
|
): |
|
|
|
initializer_block_list = set() |
|
    for node in graph.node:
        if (node.op_type in op_block_list) or (node.name in node_block_list):
            # Initializers consumed by blocked nodes must stay in float32.
            for input_name in node.input:
                initializer_block_list.add(input_name)
|
|
|
for initializer in graph.initializer: |
|
if initializer.name not in initializer_block_list: |
|
if initializer.data_type == onnx_proto.TensorProto.FLOAT: |
|
convert_tensor_float_to_float16( |
|
initializer, min_positive_val, max_finite_val |
|
) |
|
|
|
|
|
def get_next_level_graph( |
|
graph: onnx_proto.GraphProto, op_block_list: list, node_block_list: list |
|
): |
|
sub_graph_list = [] |
|
for node in graph.node: |
|
if node.op_type in op_block_list or node.name in node_block_list: |
|
continue |
|
for attr in node.attribute: |
|
|
|
if len(attr.g.node) > 0: |
|
sub_graph_list.append(attr.g) |
|
for g in attr.graphs: |
|
if len(g.node) > 0: |
|
sub_graph_list.append(g) |
|
return sub_graph_list |
|
|
|
|
|
def add_cast_node( |
|
graph: onnx_proto.GraphProto, |
|
inputs: list, |
|
outputs: list, |
|
node_name: str, |
|
to_type: int, |
|
): |
|
    new_node = helper.make_node("Cast", inputs, outputs, to=to_type, name=node_name)
    graph.node.append(new_node)
|
|
|
|
|
def add_new_value_info( |
|
graph: onnx_proto.GraphProto, |
|
exist_value_info: onnx_proto.ValueInfoProto, |
|
name: str, |
|
dtype: int, |
|
): |
|
new_value_info = graph.value_info.add() |
|
new_value_info.CopyFrom(exist_value_info) |
|
new_value_info.name = name |
|
new_value_info.type.tensor_type.elem_type = dtype |
|
|
|
|
|
|
|
def find_upstream_node_by_output_name(graph: onnx_proto.GraphProto, output_name: str): |
|
nodes = [] |
|
for node in graph.node: |
|
if output_name in node.output: |
|
nodes.append(node) |
|
    assert len(nodes) <= 1, "a tensor name may be produced by at most one node"
|
return nodes |
|
|
|
|
|
|
|
def find_downstream_node_by_input_name( |
|
graph: onnx_proto.GraphProto, input_name: str, include_subgraphs=True |
|
): |
|
nodes = [] |
|
|
|
|
|
for node in graph.node: |
|
if input_name in node.input: |
|
nodes.append(node) |
|
|
|
if not include_subgraphs: |
|
continue |
|
|
|
|
|
for attr in node.attribute: |
|
if attr.type == onnx_proto.AttributeProto.GRAPH: |
|
|
|
if len(attr.g.node) > 0: |
|
nodes.extend(find_downstream_node_by_input_name(attr.g, input_name)) |
|
|
|
|
|
if attr.type == onnx_proto.AttributeProto.GRAPHS: |
|
for g in attr.graphs: |
|
if len(g.node) > 0: |
|
nodes.extend(find_downstream_node_by_input_name(g, input_name)) |
|
|
|
return nodes |
|
|
|
|
|
|
|
def remove_identity_node_from_model(model: onnx_proto.ModelProto):
    remove_identity_node_from_graph(model.graph)
    try:
        from onnx.shape_inference import infer_shapes

        model = infer_shapes(model)
    except ImportError:
        # Shape inference is optional; return the model without it.
        pass
    return model
|
|
|
|
|
|
|
def remove_identity_node_from_graph(graph: onnx_proto.GraphProto):
    # Iterate over a copy: removing nodes from graph.node while iterating it
    # directly would skip elements.
    for curr_node in list(graph.node):
        if curr_node.op_type == "Identity":
            for input_name in curr_node.input:
                upstream_nodes = find_upstream_node_by_output_name(graph, input_name)
                for u_node in upstream_nodes:
                    u_node.output[0] = curr_node.output[0]
            graph.node.remove(curr_node)
|
|
|
|
|
def convert_float_to_float16_model_path( |
|
model_path, min_positive_val=1e-7, max_finite_val=1e4, keep_io_types=False |
|
): |
|
""" |
|
Convert tensor float type in the ONNX Model to tensor float16. |
|
*It is to fix an issue that infer_shapes func cannot be used to infer >2GB models. |
|
*But this function can be applied to all model sizes. |
|
:param model_path: ONNX Model path |
|
:return: converted ONNX ModelProto object |
|
Examples |
|
:: |
|
#Convert to ONNX ModelProto object and save model binary file: |
|
from onnxmltools.utils.float16_converter import convert_float_to_float16_model_path |
|
new_onnx_model = convert_float_to_float16_model_path('model.onnx') |
|
onnx.save(new_onnx_model, 'new_model.onnx') |
|
""" |
|
|
|
    disable_shape_infer = False
    if pv.Version(onnx.__version__) >= pv.Version("1.8"):
        try:
            # infer_shapes_path works on files, so it also supports models
            # larger than 2GB.
            import os
            import tempfile

            from onnx.shape_inference import infer_shapes_path

            # Write the shape-inferred model to a temp file next to the
            # original so both live on the same filesystem.
            with tempfile.NamedTemporaryFile(
                dir=os.path.dirname(model_path)
            ) as tmpfile:
                shape_infer_model_path = tmpfile.name
                infer_shapes_path(model_path, shape_infer_model_path)
                model = onnx.load(shape_infer_model_path)
                disable_shape_infer = True
        except Exception:
            # Fall back to loading the model without shape inference.
            pass
|
if not disable_shape_infer: |
|
model = onnx.load(model_path) |
|
return convert_float_to_float16( |
|
model, min_positive_val, max_finite_val, keep_io_types, disable_shape_infer |
|
) |
|
|
|
|
|
def remove_unnecessary_cast_node(graph_proto: onnx_proto.GraphProto): |
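    """
    Remove redundant Cast(to=float16) -> Cast(to=float32) pairs left over from
    the conversion, rewiring consumers to bypass the removed pair.
    """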
|
|
|
cast_node_list = [] |
|
input_name_to_cast_node_dict = {} |
|
output_name_to_cast_node_dict = {} |
|
|
|
name_to_node_dict = {} |
|
for node in graph_proto.node: |
|
if node.op_type == "Cast": |
|
|
|
cast_node_list.append(node) |
|
|
|
name_to_node_dict[node.name] = node |
|
for input_name in node.input: |
|
input_name_to_cast_node_dict[input_name] = node |
|
for output_name in node.output: |
|
output_name_to_cast_node_dict[output_name] = node |
|
|
|
|
|
cast_node_upstream_dict = {} |
|
cast_node_downstream_dict = {} |
|
for current_node in graph_proto.node: |
|
|
|
for input_name in current_node.input: |
|
if input_name in output_name_to_cast_node_dict: |
|
|
|
cast_node = output_name_to_cast_node_dict[input_name] |
|
if cast_node.name not in cast_node_downstream_dict: |
|
cast_node_downstream_dict[cast_node.name] = current_node |
|
else: |
|
existing_downstream_nodes = cast_node_downstream_dict[ |
|
cast_node.name |
|
] |
|
if isinstance(existing_downstream_nodes, list): |
|
existing_downstream_nodes.append(current_node) |
|
else: |
|
existing_downstream_nodes = [ |
|
existing_downstream_nodes, |
|
current_node, |
|
] |
|
cast_node_downstream_dict[cast_node.name] = ( |
|
existing_downstream_nodes |
|
) |
|
|
|
for output_name in current_node.output: |
|
if output_name in input_name_to_cast_node_dict: |
|
|
|
cast_node = input_name_to_cast_node_dict[output_name] |
|
cast_node_upstream_dict[cast_node.name] = current_node |
|
|
|
|
|
for cast_node_name, upstream_node in cast_node_upstream_dict.items(): |
|
cast_node = name_to_node_dict[cast_node_name] |
|
if upstream_node.op_type == "Constant": |
|
cast_node_list.remove(cast_node) |
|
|
|
|
|
remove_candidate = [] |
|
|
|
name_to_value_info = { |
|
value_info.name: value_info |
|
for value_info in itertools.chain(graph_proto.value_info, graph_proto.input) |
|
} |
|
|
|
    def get_type(name: str) -> Optional[int]:
        if name in name_to_value_info:
            # Return the element type (a TensorProto.DataType value) so it can
            # be compared against enum constants like TensorProto.FLOAT.
            return name_to_value_info[name].type.tensor_type.elem_type
        # Tensors without value_info have no recoverable type.
        return None
|
|
|
for cast_node_name, downstream_node in cast_node_downstream_dict.items(): |
|
cast_node = name_to_node_dict[cast_node_name] |
|
if len(cast_node.input) != 1: |
|
raise RuntimeError( |
|
f"Cast node {cast_node_name} should have only one input, but has {len(cast_node.input)}." |
|
) |
|
|
|
input_type = get_type(cast_node.input[0]) |
|
if input_type != onnx_proto.TensorProto.FLOAT: |
|
continue |
|
        if isinstance(downstream_node, list):
            for dn in downstream_node:
                # A Cast(to=float16) followed by Cast(to=float32) round trip
                # is redundant; mark the pair for removal. Compare against the
                # TensorProto enum values, not the bit widths.
                if (
                    dn.op_type == "Cast"
                    and dn.attribute[0].i == FLOAT32
                    and cast_node.attribute[0].i == FLOAT16
                    and dn in cast_node_list
                    and cast_node in cast_node_list
                ):
                    remove_candidate.append((cast_node, dn))
        else:
            if (
                downstream_node.op_type == "Cast"
                and cast_node.attribute[0].i == FLOAT16
                and downstream_node.attribute[0].i == FLOAT32
                and downstream_node in cast_node_list
                and cast_node in cast_node_list
            ):
                remove_candidate.append((cast_node, downstream_node))
|
|
|
|
|
|
|
    for first_cast_node, second_cast_node in remove_candidate:
        upstream_node = cast_node_upstream_dict.get(first_cast_node.name)
        downstream_node = cast_node_downstream_dict.get(second_cast_node.name)
        # The downstream entry may hold a single node or a list of consumers.
        downstream_node_list = (
            downstream_node if isinstance(downstream_node, list) else [downstream_node]
        )
        if upstream_node is None and downstream_node is not None:
            # The first Cast reads a graph input directly: rewire the
            # consumers of the second Cast back to that input.
            out = first_cast_node.input[0]
            for d_node in downstream_node_list:
                for i, input_name in enumerate(d_node.input):
                    if input_name in second_cast_node.output:
                        d_node.input[i] = out
        elif upstream_node is not None and downstream_node is None:
            raise ValueError(
                "Unsupported pattern: the output of the second Cast node is a "
                "graph output."
            )
        else:
            # Both a producer and consumer(s) exist: bypass the Cast pair by
            # wiring the consumers to the producer's output.
            out = None
            for output_name in upstream_node.output:
                if output_name == first_cast_node.input[0]:
                    out = output_name
                    break

            for d_node in downstream_node_list:
                for i, input_name in enumerate(d_node.input):
                    if input_name in second_cast_node.output:
                        d_node.input[i] = out
|
|
|
|
|
for cast_node_pair in remove_candidate: |
|
graph_proto.node.remove(cast_node_pair[0]) |
|
graph_proto.node.remove(cast_node_pair[1]) |
|
|
|
|
|
|
|
def check_if_fp16_ready(graph_proto): |
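    """
    Heuristically check whether the graph already looks float16-converted:
    any float16 value_info, initializer, or Cast(to=float16) node counts.
    """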
|
|
|
is_value_info_fp16 = False |
|
for value_info in itertools.chain( |
|
graph_proto.output, graph_proto.input, graph_proto.value_info |
|
): |
|
if value_info.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT16: |
|
is_value_info_fp16 = True |
|
break |
|
|
|
|
|
is_initializer_fp16 = False |
|
for initializer in graph_proto.initializer: |
|
if initializer.data_type == onnx_proto.TensorProto.FLOAT16: |
|
is_initializer_fp16 = True |
|
break |
|
|
|
|
|
has_cast_node_fp16 = False |
|
for node in graph_proto.node: |
|
if node.op_type == "Cast" and node.attribute[0].i == FLOAT16: |
|
has_cast_node_fp16 = True |
|
break |
|
|
|
|
|
    return is_value_info_fp16 or is_initializer_fp16 or has_cast_node_fp16
|
|