
Commit 299af9a

Support more ONNX INT8 ops (#1349)
Signed-off-by: yuwenzho <[email protected]>
1 parent: b7d5d6e

File tree: 4 files changed, +350 -2 lines

neural_compressor/adaptor/onnxrt.yaml

Lines changed: 93 additions & 0 deletions
@@ -379,6 +379,16 @@
         'activation': *uint8_asym_pertensor_minmax,
         'mode': ['QDQ', 'QLinear']
       },
+      'GatherElements': {
+        'weight': *uint8_asym_perchanneltensor_minmax,
+        'activation': *uint8_asym_pertensor_minmax,
+        'mode': ['QDQ', 'QLinear']
+      },
+      'GatherND': {
+        'weight': *uint8_asym_perchanneltensor_minmax,
+        'activation': *uint8_asym_pertensor_minmax,
+        'mode': ['QDQ', 'QLinear']
+      },
       'MatMul': {
         'weight': *int8_sym_perchanneltensor_minmax,
         'activation': *uint8_asym_pertensor,
@@ -422,6 +432,7 @@
       'Mod': *default_static_qlinear_qdq_minmax,
       'ReduceMax': *default_static_qlinear_qdq_minmax,
       'ReduceMin': *default_static_qlinear_qdq_minmax,
+      'Tile': *default_static_qlinear_qdq_minmax,
     },
     'dynamic': *ref_1_9_dynamic
   }
@@ -436,6 +447,88 @@
   recipes:
     <<: *default_optimization
 
+-
+  version:
+    name: '1.13.0'
+  int8: &ref_1_13 {
+    'static': {
+      'FusedConv': {
+        'weight': *int8_sym_perchanneltensor_minmax,  # QDQ: *int8_sym_pertensor_minmax
+        'activation': *uint8_asym_pertensor_minmax,
+        'mode': ['QDQ', 'QLinear']
+      },
+      'Conv': {
+        'weight': *int8_sym_perchanneltensor_minmax,
+        'activation': *uint8_asym_pertensor,
+        'mode': ['QDQ', 'QLinear']
+      },
+      'Gather': {
+        'weight': *uint8_asym_perchanneltensor_minmax,
+        'activation': *uint8_asym_pertensor_minmax,
+        'mode': ['QDQ', 'QLinear']
+      },
+      'GatherElements': {
+        'weight': *uint8_asym_perchanneltensor_minmax,
+        'activation': *uint8_asym_pertensor_minmax,
+        'mode': ['QDQ', 'QLinear']
+      },
+      'GatherND': {
+        'weight': *uint8_asym_perchanneltensor_minmax,
+        'activation': *uint8_asym_pertensor_minmax,
+        'mode': ['QDQ', 'QLinear']
+      },
+      'MatMul': {
+        'weight': *int8_sym_perchanneltensor_minmax,
+        'activation': *uint8_asym_pertensor,
+        'mode': ['QDQ', 'QLinear']
+      },
+      'Gemm': {
+        'weight': *int8_sym_perchanneltensor_minmax,
+        'activation': *uint8_asym_pertensor_minmax,
+        'mode': ['QDQ', 'QLinear']
+      },
+      'EmbedLayerNormalization': {
+        'weight': *uint8_asym_pertensor_minmax,  # QDQ: *int8_sym_pertensor_minmax
+        'activation': *uint8_asym_pertensor_minmax,
+        'mode': ['QDQ', 'QLinear']
+      },
+      'Attention': *default_static_qlinear_qdq_minmax,
+      'Mul': *default_static_qlinear,
+      'Relu': *default_static_qlinear_qdq_minmax,
+      'Clip': *default_static_qlinear_qdq_minmax,
+      'LeakyRelu': *default_static_qlinear_qdq_minmax,
+      'Sigmoid': *default_static_qlinear_qdq_minmax,
+      'MaxPool': *default_static_qlinear_qdq_minmax,
+      'GlobalAveragePool': *default_static_qlinear_qdq_minmax,
+      'Pad': *default_static_qlinear_qdq_minmax,
+      'Split': *default_static_qlinear_qdq_minmax,
+      'Add': *default_static_qlinear,
+      'Squeeze': *default_static_qlinear_qdq_minmax,
+      'Reshape': *default_static_qlinear_qdq_minmax,
+      'Concat': *default_static_qlinear_qdq_minmax,
+      'AveragePool': *default_static_qlinear_qdq_minmax,
+      'Unsqueeze': *default_static_qlinear_qdq_minmax,
+      'Transpose': *default_static_qlinear_qdq_minmax,
+      'ArgMax': *default_static_qlinear,
+      'Resize': *default_static_qlinear_qdq_minmax,
+      'Abs': *default_static_qlinear_qdq_minmax,
+      'Shrink': *default_static_qlinear_qdq_minmax,
+      'Sign': *default_static_qlinear_qdq_minmax,
+      'Flatten': *default_static_qlinear_qdq_minmax,
+      'Expand': *default_static_qlinear_qdq_minmax,
+      'Slice': *default_static_qlinear_qdq_minmax,
+      'Mod': *default_static_qlinear_qdq_minmax,
+      'ReduceMax': *default_static_qlinear_qdq_minmax,
+      'ReduceMin': *default_static_qlinear_qdq_minmax,
+      'Tile': *default_static_qlinear_qdq_minmax,
+      'CenterCropPad': *default_static_qlinear_qdq_minmax,
+    },
+    'dynamic': *ref_1_9_dynamic
+  }
+  weight_only_integer: *cap_weight_only
+  recipes:
+    <<: *default_optimization
+
 -
   version:
     name: 'default'
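
These capability entries are what the ONNX Runtime adaptor reads when it builds the tuning space, so listing GatherElements, GatherND, Tile, and CenterCropPad here is what exposes them to static quantization on onnxruntime 1.13+. A minimal sketch of how a user would exercise the new entries, assuming the neural_compressor 2.x API (the model path and calibration dataloader below are hypothetical placeholders, not part of this commit):

# Minimal post-training static quantization sketch; "model.onnx" and
# calib_dataloader are hypothetical placeholders.
from neural_compressor import PostTrainingQuantConfig, quantization

config = PostTrainingQuantConfig(
    approach="static",   # selects the 'static' capability table above
    quant_format="QDQ",  # or "QOperator" for QLinear-style graphs
)
q_model = quantization.fit("model.onnx", config, calib_dataloader=calib_dataloader)
q_model.save("model_int8.onnx")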

neural_compressor/adaptor/ox_utils/operators/direct_q8.py

Lines changed: 2 additions & 1 deletion
@@ -20,7 +20,8 @@
 
 
 @op_registry(
-    op_types="Reshape, Transpose, Squeeze, Unsqueeze, Flatten, Expand, Slice, " "SpaceToDepth, DepthToSpace, Upsample"
+    op_types="Reshape, Transpose, Squeeze, Unsqueeze, Flatten, Expand, Slice, "
+    "SpaceToDepth, DepthToSpace, Upsample, Tile, CenterCropPad"
 )
 class Direct8BitOperator(Operator):
     """Direct8Bit Operator."""

neural_compressor/adaptor/ox_utils/operators/gather.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@
 from neural_compressor.adaptor.ox_utils.util import attribute_to_kwarg
 
 
-@op_registry(op_types="Gather")
+@op_registry(op_types="Gather, GatherElements, GatherND")
 class GatherOperator(Operator):
     """Gather Operator."""
 
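
All three gather variants index into a tensor without doing arithmetic on its values, so one operator class can quantize them identically; the decorator simply registers the class under each op type. A rough sketch of that registry pattern (illustrative only, not the library's actual implementation):

# Hypothetical stand-in for neural_compressor's op_registry, to show the
# dispatch idea: one class is looked up under several ONNX op types.
OPERATORS = {}

def op_registry(op_types):
    def decorator(cls):
        for op_type in map(str.strip, op_types.split(",")):
            OPERATORS[op_type] = cls
        return cls
    return decorator

@op_registry(op_types="Gather, GatherElements, GatherND")
class GatherOperator:
    """Stand-in for the real GatherOperator."""

assert OPERATORS["GatherND"] is GatherOperator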

test/adaptor/onnxrt_adaptor/test_onnxrt_operators.py

Lines changed: 254 additions & 0 deletions
@@ -1562,6 +1562,260 @@ def test_reducemin_reducemax(self):
         session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"])
         self.assertIsNotNone(session)
 
+    def test_tile(self):
+        # test Tile nodes: MatMul-Tile-MatMul
+        input_tensor = helper.make_tensor_value_info("input", TensorProto.FLOAT, [2, 3, 4, 1])
+
+        matmul1_weight = helper.make_tensor(
+            "matmul1_weight", TensorProto.FLOAT, [1, 5], np.random.random((1, 5)).reshape(5).tolist()
+        )
+        matmul1_output = helper.make_tensor_value_info("matmul1_output", TensorProto.FLOAT, [2, 3, 4, 5])
+        matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0")
+
+        repeats = helper.make_tensor("repeats", TensorProto.INT64, [4], [2, 2, 2, 2])
+        tile_output = helper.make_tensor_value_info("tile_output", TensorProto.FLOAT, [4, 6, 8, 10])
+        tile_node = onnx.helper.make_node(
+            "Tile",
+            ["matmul1_output", "repeats"],
+            ["tile_output"],
+            name="Tile_1",
+        )
+
+        matmul2_weight = helper.make_tensor(
+            "matmul2_weight", TensorProto.FLOAT, [10, 1], np.random.random((10, 1)).reshape(10).tolist()
+        )
+        matmul2_output = helper.make_tensor_value_info("matmul2_output", TensorProto.FLOAT, [4, 6, 8, 1])
+        matmul2_node = onnx.helper.make_node(
+            "MatMul", ["tile_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_2"
+        )
+
+        initializers = [matmul1_weight, matmul2_weight, repeats]
+        graph = helper.make_graph(
+            [matmul1_node, tile_node, matmul2_node],
+            "TestTile_test_model",
+            [input_tensor],
+            [matmul2_output],
+            initializer=initializers,
+        )
+        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
+        model.ir_version = 7
+
+        q_config = {"Matmul_0": self.static_q_config, "Tile_1": self.static_q_config, "Matmul_2": self.static_q_config}
+        quantize_params = {
+            "input": [np.uint8(10.0), np.float32(0)],
+            "matmul1_weight": [np.uint8(10.0), np.float32(0)],
+            "matmul1_output": [np.uint8(10.0), np.float32(0)],
+            "matmul2_weight": [np.uint8(10.0), np.float32(0)],
+            "matmul2_output": [np.uint8(10.0), np.float32(0)],
+            "tile_output": [np.uint8(10.0), np.float32(0)],
+        }
+        quantizable_op_types = ["MatMul", "Tile"]
+        q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types)
+        self.assertEqual(Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1)
+        self.assertEqual(Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1)
+        session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"])
+        self.assertIsNotNone(session)
+
+        q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types)
+        self.assertEqual(Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6)
+        self.assertEqual(Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4)
+        session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"])
+        self.assertIsNotNone(session)
+
+    def test_centercroppad(self):
+        # test CenterCropPad nodes: MatMul-CenterCropPad-MatMul
+        input_tensor = helper.make_tensor_value_info("input", TensorProto.FLOAT, [20, 10, 1])
+
+        matmul1_weight = helper.make_tensor(
+            "matmul1_weight", TensorProto.FLOAT, [1, 3], np.random.random((1, 3)).reshape(3).tolist()
+        )
+        matmul1_output = helper.make_tensor_value_info("matmul1_output", TensorProto.FLOAT, [20, 10, 3])
+        matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0")
+
+        centercroppad_output = helper.make_tensor_value_info("centercroppad_output", TensorProto.FLOAT, [10, 7, 3])
+        shape = helper.make_tensor("shape", TensorProto.INT64, [3], [10, 7, 3])
+        centercroppad_node = onnx.helper.make_node(
+            "CenterCropPad",
+            ["matmul1_output", "shape"],
+            ["centercroppad_output"],
+            name="Centercroppad_1",
+        )
+
+        matmul2_weight = helper.make_tensor(
+            "matmul2_weight", TensorProto.FLOAT, [3, 1], np.random.random((3, 1)).reshape(3).tolist()
+        )
+        matmul2_output = helper.make_tensor_value_info("matmul2_output", TensorProto.FLOAT, [10, 7, 1])
+        matmul2_node = onnx.helper.make_node(
+            "MatMul", ["centercroppad_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_2"
+        )
+
+        initializers = [matmul1_weight, shape, matmul2_weight]
+        graph = helper.make_graph(
+            [matmul1_node, centercroppad_node, matmul2_node],
+            "TestCenterCropPad_test_model",
+            [input_tensor],
+            [matmul2_output],
+            initializer=initializers,
+        )
+        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 18)])
+        model.ir_version = 8
+
+        q_config = {
+            "Matmul_0": self.static_q_config,
+            "Centercroppad_1": self.static_q_config,
+            "Matmul_2": self.static_q_config,
+        }
+        quantize_params = {
+            "input": [np.uint8(10.0), np.float32(0)],
+            "matmul1_weight": [np.uint8(10.0), np.float32(0)],
+            "matmul1_output": [np.uint8(10.0), np.float32(0)],
+            "matmul2_weight": [np.uint8(10.0), np.float32(0)],
+            "matmul2_output": [np.uint8(10.0), np.float32(0)],
+            "centercroppad_output": [np.uint8(10.0), np.float32(0)],
+        }
+        quantizable_op_types = ["MatMul", "CenterCropPad"]
+        q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types)
+        self.assertEqual(Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1)
+        self.assertEqual(Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1)
+        session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"])
+        self.assertIsNotNone(session)
+
+        q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types)
+        self.assertEqual(Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6)
+        self.assertEqual(Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4)
+        session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"])
+        self.assertIsNotNone(session)
+
+    def test_gathernd(self):
+        # test GatherND nodes: MatMul-GatherND-MatMul
+        input_tensor = helper.make_tensor_value_info("input", TensorProto.FLOAT, [2, 2, 1])
+
+        matmul1_weight = helper.make_tensor(
+            "matmul1_weight", TensorProto.FLOAT, [1, 2], np.random.random((1, 2)).reshape(2).tolist()
+        )
+        matmul1_output = helper.make_tensor_value_info("matmul1_output", TensorProto.FLOAT, [2, 2, 2])
+        matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0")
+
+        gathernd_output = helper.make_tensor_value_info("gathernd_output", TensorProto.FLOAT, [2, 1, 2])
+        indices = helper.make_tensor("indices", TensorProto.INT64, [2, 1, 2], [0, 1, 1, 0])
+        gathernd_node = onnx.helper.make_node(
+            "GatherND",
+            ["matmul1_output", "indices"],
+            ["gathernd_output"],
+            name="Gathernd_1",
+        )
+
+        matmul2_weight = helper.make_tensor(
+            "matmul2_weight", TensorProto.FLOAT, [2, 1], np.random.random((2, 1)).reshape(2).tolist()
+        )
+        matmul2_output = helper.make_tensor_value_info("matmul2_output", TensorProto.FLOAT, [2, 1, 1])
+        matmul2_node = onnx.helper.make_node(
+            "MatMul", ["gathernd_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_2"
+        )
+
+        initializers = [matmul1_weight, indices, matmul2_weight]
+        graph = helper.make_graph(
+            [matmul1_node, gathernd_node, matmul2_node],
+            "TestGatherND_test_model",
+            [input_tensor],
+            [matmul2_output],
+            initializer=initializers,
+        )
+        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
+        model.ir_version = 7
+
+        q_config = {
+            "Matmul_0": self.static_q_config,
+            "Matmul_2": self.static_q_config,
+            "Gathernd_1": self.static_q_config,
+        }
+
+        quantize_params = {
+            "input": [np.uint8(10.0), np.float32(0)],
+            "matmul1_weight": [np.uint8(10.0), np.float32(0)],
+            "matmul1_output": [np.uint8(10.0), np.float32(0)],
+            "matmul2_weight": [np.uint8(10.0), np.float32(0)],
+            "matmul2_output": [np.uint8(10.0), np.float32(0)],
+            "gathernd_output": [np.uint8(10.0), np.float32(0)],
+        }
+        quantizable_op_types = ["MatMul", "GatherND"]
+        q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types)
+        self.assertEqual(Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1)
+        self.assertEqual(Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1)
+        session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"])
+        self.assertIsNotNone(session)
+
+        q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types)
+        self.assertEqual(Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6)
+        self.assertEqual(Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4)
+        session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"])
+        self.assertIsNotNone(session)
+
+    def test_gatherelements(self):
+        # test GatherElements nodes: MatMul-GatherElements-MatMul
+        input_tensor = helper.make_tensor_value_info("input", TensorProto.FLOAT, [3, 1])
+
+        matmul1_weight = helper.make_tensor(
+            "matmul1_weight", TensorProto.FLOAT, [1, 3], np.random.random((1, 3)).reshape(3).tolist()
+        )
+        matmul1_output = helper.make_tensor_value_info("matmul1_output", TensorProto.FLOAT, [3, 3])
+        matmul1_node = onnx.helper.make_node("MatMul", ["input", "matmul1_weight"], ["matmul1_output"], name="Matmul_0")
+
+        gatherelements_output = helper.make_tensor_value_info("gatherelements_output", TensorProto.FLOAT, [2, 3])
+        indices = helper.make_tensor("indices", TensorProto.INT64, [2, 3], [-1, -2, 0, -2, 0, 0])
+        gathernd_node = onnx.helper.make_node(
+            "GatherElements",
+            ["matmul1_output", "indices"],
+            ["gatherelements_output"],
+            name="Gatherelements_1",
+        )
+
+        matmul2_weight = helper.make_tensor(
+            "matmul2_weight", TensorProto.FLOAT, [3, 1], np.random.random((3, 1)).reshape(3).tolist()
+        )
+        matmul2_output = helper.make_tensor_value_info("matmul2_output", TensorProto.FLOAT, [2, 1])
+        matmul2_node = onnx.helper.make_node(
+            "MatMul", ["gatherelements_output", "matmul2_weight"], ["matmul2_output"], name="Matmul_2"
+        )
+
+        initializers = [matmul1_weight, indices, matmul2_weight]
+        graph = helper.make_graph(
+            [matmul1_node, gathernd_node, matmul2_node],
+            "TestGatherElements_test_model",
+            [input_tensor],
+            [matmul2_output],
+            initializer=initializers,
+        )
+        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
+        model.ir_version = 7
+
+        q_config = {
+            "Matmul_0": self.static_q_config,
+            "Matmul_2": self.static_q_config,
+            "Gatherelements_1": self.static_q_config,
+        }
+
+        quantize_params = {
+            "input": [np.uint8(10.0), np.float32(0)],
+            "matmul1_weight": [np.uint8(10.0), np.float32(0)],
+            "matmul1_output": [np.uint8(10.0), np.float32(0)],
+            "matmul2_weight": [np.uint8(10.0), np.float32(0)],
+            "matmul2_output": [np.uint8(10.0), np.float32(0)],
+            "gatherelements_output": [np.uint8(10.0), np.float32(0)],
+        }
+        quantizable_op_types = ["MatMul", "GatherElements"]
+        q_model = self.qlinear_test(model, q_config, quantize_params, quantizable_op_types)
+        self.assertEqual(Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 1)
+        self.assertEqual(Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 1)
+        session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"])
+        self.assertIsNotNone(session)
+
+        q_model = self.qdq_test(model, q_config, quantize_params, quantizable_op_types)
+        self.assertEqual(Counter([node.op_type for node in q_model.model.graph.node])["DequantizeLinear"], 6)
+        self.assertEqual(Counter([node.op_type for node in q_model.model.graph.node])["QuantizeLinear"], 4)
+        session = ort.InferenceSession(q_model.model.SerializeToString(), providers=["CPUExecutionProvider"])
+        self.assertIsNotNone(session)
+
 
 class TestCastONNXRT(unittest.TestCase):
     @classmethod
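
Each test encodes the expected graph shape in its node counts: in QLinear mode the two MatMuls fuse into QLinear ops and the middle op runs directly on quantized tensors, leaving one QuantizeLinear at the model input and one DequantizeLinear at the output; in QDQ mode each of the four activation tensors gets an explicit QuantizeLinear/DequantizeLinear pair and the two weight initializers get DequantizeLinear only, hence 4 and 6. To run just these cases, a standard pytest keyword filter should work (assuming onnx and onnxruntime are installed; the command is a suggestion, not part of this commit):

python -m pytest test/adaptor/onnxrt_adaptor/test_onnxrt_operators.py -k "tile or centercroppad or gathernd or gatherelements"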
