Merge branch 'openvinotoolkit:develop' into develop

anzr299 authored Sep 2, 2024
2 parents e7097bd + 03f65a7 commit 2665666
Showing 97 changed files with 55,669 additions and 13,450 deletions.
6 changes: 4 additions & 2 deletions docs/ModelZoo.md
@@ -1,8 +1,10 @@
# NNCF Compressed Model Zoo

Here we present the results achieved using our sample scripts, example patches to third-party repositories and NNCF configuration files.
Ready-to-use **Compressed LLMs** can be found on the [OpenVINO Hugging Face page](https://huggingface.co/OpenVINO#models). Each model card includes the NNCF parameters that were used to compress the model.

The applied quantization compression algorithms are divided into two broad categories: Quantization-Aware Training ([QAT](../README.md#training-time-compression)) and Post-Training Quantization ([PTQ](../README.md#post-training-quantization)). Here we mainly report the QAT results and the PTQ results may be found on an OpenVino Performance Benchmarks [page](https://docs.openvino.ai/2024/about-openvino/performance-benchmarks.html).
**INT8 Post-Training Quantization** ([PTQ](../README.md#post-training-quantization)) results for public Vision, NLP and GenAI models can be found on the [OpenVINO Performance Benchmarks page](https://docs.openvino.ai/2024/about-openvino/performance-benchmarks.html). PTQ results for ONNX models are available in the [ONNX](#onnx) section below.

**Quantization-Aware Training** ([QAT](../README.md#training-time-compression)) results for PyTorch and TensorFlow public models can be found below.

- [PyTorch](#pytorch)
- [Classification](#pytorch-classification)
70 changes: 69 additions & 1 deletion nncf/common/quantization/quantizer_propagation/solver.py
@@ -468,7 +468,9 @@ def _filter_by_weight_ignored_target_scopes(
nncf_logger.debug(f"Ignored adding weight quantizer for: {node_name}")
return weight_quantizable_node_names_vs_qconfigs

def run_on_ip_graph(self, ip_graph: InsertionPointGraph) -> QuantizationProposal:
def run_on_ip_graph(
self, ip_graph: InsertionPointGraph, metatypes_for_filter: Optional[List[OperatorMetatype]] = None
) -> QuantizationProposal:
"""
The main function to be used on an InsertionPointGraph to produce
the list of insertion commands and configs corresponding to the desired quantized
@@ -479,6 +481,7 @@ def run_on_ip_graph(self, ip_graph: InsertionPointGraph) -> QuantizationProposal
:param ip_graph: The InsertionPointGraph, potentially with fused operations w.r.t. the
original model graph. The propagating quantizers will travel along the pre- and post-
hook nodes registered in this graph.
:param metatypes_for_filter: Metatypes used as the removal criterion for propagating quantizers.
:return: The intermediate propagation state in the form of QuantizationProposal, which
defines unambiguously the locations of the propagating quantizers, but not the final
configurations.
@@ -513,6 +516,8 @@ def run_on_ip_graph(self, ip_graph: InsertionPointGraph) -> QuantizationProposal
iteration_counter += 1

quant_prop_graph = self._filter_integer_input_quantizers(quant_prop_graph)
if metatypes_for_filter:
quant_prop_graph = self._filter_quantizers_by_metatypes(quant_prop_graph, metatypes_for_filter)

if self._visualizer is not None:
self._visualizer.visualize_quantizer_propagation(self, quant_prop_graph, "proposed")
@@ -1597,3 +1602,66 @@ def _filter_integer_input_quantizers(
quant_prop_graph.remove_propagating_quantizer(integer_input_pq)

return quant_prop_graph

def _filter_quantizers_by_metatypes(
self, quant_prop_graph: QuantizerPropagationStateGraph, metatypes: List[OperatorMetatype]
) -> QuantizerPropagationStateGraph:
"""
Removes quantizers for which _is_quantizer_to_remove returns True.
:param quant_prop_graph: The quantizer propagation state graph.
:param metatypes: Metatypes used as the removal criterion.
:return: Filtered quantizer propagation state graph.
"""

def _is_quantizer_to_remove(
quant_prop_graph: QuantizerPropagationStateGraph,
quantizer: PropagatingQuantizer,
metatypes: List[OperatorMetatype],
) -> bool:
"""
Returns True if the quantizer meets the criteria for removal. The criteria are as follows:
1. The quantizer is generated from a node whose metatype is in the provided metatypes.
2. The quantizer is not propagated.
3. The quantizer has only one child.
4. The quantized node generates only one activation quantizer.
The function relies on the fact that the considered metatypes are expected to have two inputs.
In that case, if the considered node in the InsertionPointGraph has only one input,
it means that the other one is a constant.
:param quant_prop_graph: The quantizer propagation state graph holding the `quantizer`.
:param quantizer: The propagating quantizer currently being considered.
:param metatypes: Metatypes used for the criterion.
:return: True if the quantizer satisfies the criteria, False otherwise.
"""
quantizer_children = quantizer.quantized_input_sink_operator_nodes
quantized_node_metatype = quant_prop_graph.nodes[quantized_node_key][
QuantizerPropagationStateGraph.OPERATOR_METATYPE_NODE_ATTR
]
quantizers_generated_for_node = quant_prop_graph.nodes[quantized_node_key][
quant_prop_graph.AFFECTING_PROPAGATING_QUANTIZERS_ATTR
]

is_one_quantizer_generated_for_node = len(quantizers_generated_for_node) == 1
is_one_child = len(quantizer_children) == 1
is_metatype_to_filter = quantized_node_metatype in metatypes
is_quantizer_not_propagated = len(quantizer.propagation_path) <= 1

return (
is_one_child
and is_metatype_to_filter
and is_one_quantizer_generated_for_node
and is_quantizer_not_propagated
)

quantizers = self._finished_propagating_quantizers
to_remove_quantizers = []
for quantizer in quantizers:
quantized_node_key = next(iter(quantizer.quantized_input_sink_operator_nodes))
if _is_quantizer_to_remove(quant_prop_graph, quantizer, metatypes):
nncf_logger.debug(f"Quantizer generated for a node {quantized_node_key} will be removed.")
to_remove_quantizers.append(quantizer)
for quantizer in to_remove_quantizers:
quant_prop_graph.remove_propagating_quantizer(quantizer)
self._finished_propagating_quantizers.remove(quantizer)
return quant_prop_graph
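
A hedged usage sketch of the new filtering hook: `solver` is assumed to be an already configured QuantizerPropagationSolver and `ip_graph` an InsertionPointGraph built for the model; SomeTwoInputMetatype stands in for a real two-input OperatorMetatype subclass and is not an NNCF name.

# Sketch only: the names below are illustrative, not part of the commit.
proposal = solver.run_on_ip_graph(ip_graph, metatypes_for_filter=[SomeTwoInputMetatype])
# A quantizer is dropped from the proposal only if it was generated for a node with a
# listed metatype, did not propagate further, has a single child, and is the only
# activation quantizer generated for that node.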
15 changes: 15 additions & 0 deletions nncf/experimental/torch/fx/groups.py
@@ -0,0 +1,15 @@
# Copyright (c) 2024 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import nncf.torch.graph.operator_metatypes as om
from nncf.torch.model_graph_manager import OPERATORS_WITH_BIAS_METATYPES

FX_OPERATORS_WITH_BIAS_METATYPES = tuple(OPERATORS_WITH_BIAS_METATYPES) + (om.PTLinearMetatype,)
117 changes: 80 additions & 37 deletions nncf/experimental/torch/fx/model_transformer.py
@@ -10,11 +10,10 @@
# limitations under the License.

from collections import defaultdict
from typing import List
from typing import List, Set

import torch
import torch.fx
from torch.fx.passes.split_utils import split_by_tags

from nncf.common.graph.model_transformer import ModelTransformer
from nncf.experimental.torch.fx.commands import FXApplyTransformationCommand
@@ -26,6 +25,8 @@
class FXModelTransformer(ModelTransformer):
"""
Applies transformations upon Torch FX model.
FXApplyTransformationCommands are applied in place;
PTModelExtractionCommands do not change the input model.
"""

def __init__(self, model: torch.fx.GraphModule):
@@ -61,6 +62,31 @@ def transform(self, transformation_layout: PTTransformationLayout) -> torch.fx.G
model.recompile()
return model

@staticmethod
def _traverse_graph(
input_nodes: List[torch.fx.Node],
stop_nodes: Set[torch.fx.Node],
visited: Set[torch.fx.Node],
) -> None:
"""
Traverses the graph starting from the input nodes and stopping at
the stop nodes and at already visited nodes. As a result,
it updates the visited container with all nodes visited during the traversal.
:param input_nodes: Nodes to start the traversal from.
:param stop_nodes: Names of the nodes at which the traversal stops.
:param visited: Set of names of already visited nodes.
"""

while input_nodes:
in_node = input_nodes.pop()
if in_node.name in visited or in_node.name in stop_nodes:
continue

visited.add(in_node.name)
input_nodes.extend(in_node.all_input_nodes)
input_nodes.extend(list(in_node.users))
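
For illustration, the same name-based, bidirectional traversal can be reproduced on a toy torch.fx graph; this is a self-contained sketch, and the toy module and start-node choice are illustrative only.

import torch
import torch.fx


class ToyModel(torch.nn.Module):
    def forward(self, x):
        y = x + 1
        return y * 2


gm = torch.fx.symbolic_trace(ToyModel())

# Start from the first call_function node and walk both producers and consumers,
# stopping at the graph output node and at already visited nodes.
start = next(n for n in gm.graph.nodes if n.op == "call_function")
stop_names = {"output"}
visited = set()
frontier = [start]
while frontier:
    node = frontier.pop()
    if node.name in visited or node.name in stop_names:
        continue
    visited.add(node.name)
    frontier.extend(node.all_input_nodes)
    frontier.extend(node.users)

print(sorted(visited))  # node names reachable without crossing the stop nodes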

@staticmethod
def _apply_model_extraction(
model: torch.fx.GraphModule,
@@ -75,46 +101,63 @@ def _apply_model_extraction(
more than one element this function raises an assert.
:return: Returns a submodel extracted from the given model by the given transformation.
"""

transformation = transformations[-1]
assert len(transformation.input_node_names) == 1
assert transformation.input_node_names == transformation.output_node_names
node_name = transformation.input_node_names[0]
stop_nodes = set(transformation.input_node_names + transformation.output_node_names)
visited = set()

for node_name in transformation.input_node_names:
node = get_graph_node_by_name(model.graph, node_name)
visited.add(node.name)
target_inputs = node.all_input_nodes[1:]
if node.name not in transformation.output_node_names:
target_inputs += list(node.users)
FXModelTransformer._traverse_graph(target_inputs, stop_nodes, visited)

for node_name in transformation.output_node_names:
node = get_graph_node_by_name(model.graph, node_name)
visited.add(node.name)
if node.name not in transformation.input_node_names:
FXModelTransformer._traverse_graph(node.all_input_nodes, stop_nodes, visited)

extracted_graph = torch.fx.Graph()
value_remap = {}

def remap_fn(node: torch.fx.Node):
return value_remap.get(node) # noqa F821

tags = ["before", "extracted", "after"]
i = 0
for node in model.graph.nodes:
if node.name == node_name:
node.tag = tags[1]
weights = [node.all_input_nodes[1]]
while weights:
w_node = weights.pop()
assert w_node.tag in tags[0:2]
w_node.tag = tags[1]
weights.extend(w_node.all_input_nodes)
i = 2
if node.name not in visited or node.op == "output":
continue
node.tag = tags[i]

# TODO(dlyakhov): reduce memory consumption by
# more optimal splitting implementation.
splitted_gm = split_by_tags(model, tags)

extracted_model = splitted_gm.extracted
graph: torch.fx.Graph = extracted_model.graph
# Check that the extracted model has inputs.
# It is possible to have two constant inputs
# for the target layer; in that case a placeholder is
# placed at the input port.
target_node = get_graph_node_by_name(graph, node_name)
input_node = target_node.all_input_nodes[0]
if input_node.op != "placeholder":
with graph.inserting_before(target_node):
new_input_node = graph.create_node(
"placeholder", "placeholder_node", (), {}, name="placeholder_graph_node"
value_remap[node] = extracted_graph.node_copy(node, remap_fn)
del value_remap

for input_name in transformation.input_node_names:
node_with_input = get_graph_node_by_name(extracted_graph, input_name)
with extracted_graph.inserting_before(node_with_input):
graph_input_name = input_name + "_input"
graph_input = extracted_graph.create_node(
op="placeholder",
target=graph_input_name,
name=graph_input_name,
)
target_node.replace_input_with(input_node, new_input_node)
extracted_model.graph.eliminate_dead_code()
return extracted_model

args = list(node_with_input.args)
args[0] = graph_input
node_with_input.args = tuple(args)

nodes_with_output = [get_graph_node_by_name(extracted_graph, name) for name in transformation.output_node_names]
last_node = list(extracted_graph.nodes)[-1]
with extracted_graph.inserting_after(last_node):
graph_output_name = "output"
extracted_graph.create_node(
"output",
graph_output_name,
(tuple(nodes_with_output),),
name=graph_output_name,
)

return torch.fx.GraphModule(model, extracted_graph)
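
A condensed, hedged sketch of the node-copy pattern used by _apply_model_extraction, restricted to a single-input/single-output region; copy_region and its arguments are illustrative names, not NNCF API, and region_node_names is assumed to also contain the region's constant (get_attr) nodes.

import torch
import torch.fx


def copy_region(
    gm: torch.fx.GraphModule, region_node_names: set, input_name: str, output_name: str
) -> torch.fx.GraphModule:
    new_graph = torch.fx.Graph()
    value_remap = {}
    for node in gm.graph.nodes:
        if node.name in region_node_names:
            # Inputs outside the region map to None here and are patched with a placeholder below.
            value_remap[node] = new_graph.node_copy(node, lambda n: value_remap.get(n))

    # Feed the entry node of the region from a fresh placeholder input.
    entry = next(n for n in new_graph.nodes if n.name == input_name)
    with new_graph.inserting_before(entry):
        graph_input = new_graph.placeholder(input_name + "_input")
    args = list(entry.args)
    args[0] = graph_input
    entry.args = tuple(args)

    # Expose the exit node of the region as the output of the extracted graph.
    exit_node = next(n for n in new_graph.nodes if n.name == output_name)
    new_graph.output((exit_node,))
    return torch.fx.GraphModule(gm, new_graph)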

@staticmethod
def _apply_transformation(
76 changes: 76 additions & 0 deletions nncf/experimental/torch/fx/model_utils.py
@@ -0,0 +1,76 @@
# Copyright (c) 2024 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import deque

import torch.fx

from nncf.common.factory import ModelTransformerFactory
from nncf.common.graph.definitions import NNCFGraphNodeType
from nncf.common.graph.graph import NNCFGraph
from nncf.common.graph.transformations.commands import TargetType
from nncf.common.graph.transformations.layout import TransformationLayout
from nncf.experimental.torch.fx.commands import FXApplyTransformationCommand
from nncf.experimental.torch.fx.transformations import node_removal_transformation_builder
from nncf.torch.graph.operator_metatypes import QUANTIZE_NODE_TYPES
from nncf.torch.graph.transformations.commands import PTTargetPoint


def remove_fq_from_inputs(model: torch.fx.GraphModule, graph: NNCFGraph) -> torch.fx.GraphModule:
"""
This method removes the activation Fake Quantize nodes from the model.
It is needed for the subsequent bias shift calculation that relies on quantized weights.
:param model: torch.fx.GraphModule instance.
:param graph: NNCFGraph instance.
:return: torch.fx.GraphModule instance without activation Fake Quantize nodes.
"""
transformation_layout = TransformationLayout()
model_transformer = ModelTransformerFactory.create(model)

seen_nodes = []
nodes_queue = deque(graph.get_input_nodes())
while nodes_queue:
current_node = nodes_queue.popleft()
current_node_name = current_node.node_name

if current_node_name in seen_nodes:
continue

seen_nodes.append(current_node_name)
if current_node.node_type in QUANTIZE_NODE_TYPES:
transformation = node_removal_transformation_builder(current_node, input_port_id=0)
transformation_layout.register(FXApplyTransformationCommand(transformation))
nodes_queue.extend(graph.get_next_nodes(current_node))

return model_transformer.transform(transformation_layout)
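
A hedged usage sketch for remove_fq_from_inputs: quantized_gm is a hypothetical torch.fx.GraphModule that already contains activation FakeQuantize nodes, and it is assumed that NNCFGraphFactory can build an NNCFGraph for it.

from nncf.common.factory import NNCFGraphFactory

# quantized_gm: torch.fx.GraphModule with FakeQuantize nodes (hypothetical, produced elsewhere).
nncf_graph = NNCFGraphFactory.create(quantized_gm)
# Returns a transformed model with activation FakeQuantize nodes on the input paths removed.
stripped_gm = remove_fq_from_inputs(quantized_gm, nncf_graph)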


_TARGET_TYPE_TO_FX_INS_TYPE_MAP = {
TargetType.PRE_LAYER_OPERATION: TargetType.OPERATOR_PRE_HOOK,
TargetType.POST_LAYER_OPERATION: TargetType.OPERATOR_POST_HOOK,
}


def get_target_point(target_type: TargetType, target_node_name: str, port_id: int) -> PTTargetPoint:
"""
Creates torch-specific target point.
:param target_type: Target point target type.
:param target_node_name: Target node name to use in the target point.
:param port_id: Target port id.
:return: Torch-specific target point.
"""
if NNCFGraphNodeType.INPUT_NODE in target_node_name or target_type == TargetType.POST_LAYER_OPERATION:
port_id = None
if target_type in _TARGET_TYPE_TO_FX_INS_TYPE_MAP:
target_type = _TARGET_TYPE_TO_FX_INS_TYPE_MAP[target_type]
return PTTargetPoint(target_type, target_node_name, input_port_id=port_id)
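
A brief usage sketch for get_target_point; the node name and port below are illustrative values, not taken from a real model.

# Pre-hook on input port 0 of a (hypothetical) conv node:
# the target type is mapped to TargetType.OPERATOR_PRE_HOOK and the port id is kept.
pre_tp = get_target_point(TargetType.PRE_LAYER_OPERATION, "conv2d_0", port_id=0)

# Post-hook on the same node: the port id is dropped, as described above.
post_tp = get_target_point(TargetType.POST_LAYER_OPERATION, "conv2d_0", port_id=0)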