diff --git a/examples/torch/classification/README.md b/examples/torch/classification/README.md
index 9b14683add4..fed496581bc 100644
--- a/examples/torch/classification/README.md
+++ b/examples/torch/classification/README.md
@@ -126,7 +126,7 @@ As an example of NNCF convolution binarization capabilities, you may use the con
 |ResNet-18|Filter pruning, 40%, magnitude criterion|ImageNet|69.26 (0.54)|2.75 (75.75%)|9.23 (79.02%)|[Link](configs/pruning/resnet18_pruning_magnitude.json)|[Link](https://storage.openvinotoolkit.org/repositories/nncf/models/develop/torch/resnet18_imagenet_filter_pruning_magnitude.pth)|
 |ResNet-18|Filter pruning, 40%, geometric median criterion|ImageNet|69.32 (0.48)|2.75 (75.75%)|9.23 (79.02%)|[Link](configs/pruning/resnet18_pruning_geometric_median.json)|[Link](https://storage.openvinotoolkit.org/repositories/nncf/models/develop/torch/resnet18_imagenet_filter_pruning_geomean.pth)|
 |ResNet-34|None|ImageNet|73.26|7.33 (100%)|21.78 (100%)|[Link](configs/pruning/resnet34_imagenet.json)|-|
-|ResNet-34|Filter pruning, 40%, geometric median criterion|ImageNet|72.72 (0.54)|5.06 (69.03%)|15.47 (71.03%)|[Link](configs/pruning/resnet34_pruning_geometric_median.json)|[Link](https://storage.openvinotoolkit.org/repositories/nncf/models/develop/torch/resnet34_imagenet_filter_pruning_geomean.pth)|
+|ResNet-34|Filter pruning, 50%, geometric median criterion + KD|ImageNet|73.11 (0.15)|4.32 (58.96%)|13.56 (62.25%)|[Link](configs/pruning/resnet34_pruning_geometric_median_kd.json)|[Link](https://storage.openvinotoolkit.org/repositories/nncf/models/develop/torch/resnet34_imagenet_filter_pruning_geomean_kd.pth)|
 |GoogLeNet|None|ImageNet|69.72|2.99 (100%)|6.61 (100%)|[Link](configs/pruning/googlenet_imagenet.json)|-|
 |GoogLeNet|Filter pruning, 40%, geometric median criterion|ImageNet|68.89 (0.83)|1.36 (45.48%)|3.47 (52.50%)|[Link](configs/pruning/googlenet_pruning_geometric_median.json)|[Link](https://storage.openvinotoolkit.org/repositories/nncf/models/develop/torch/googlenet_imagenet_filter_pruning_geomean.pth)|
diff --git a/examples/torch/classification/configs/pruning/resnet34_pruning_geometric_median.json b/examples/torch/classification/configs/pruning/resnet34_pruning_geometric_median.json
deleted file mode 100644
index c9a3c1ac5b1..00000000000
--- a/examples/torch/classification/configs/pruning/resnet34_pruning_geometric_median.json
+++ /dev/null
@@ -1,38 +0,0 @@
-{
-    "model": "resnet34",
-    "pretrained": true,
-    "batch_size" : 512,
-    "epochs": 100,
-    "input_info": {
-        "sample_size": [1, 3, 224, 224]
-    },
-    "optimizer": {
-        "type": "SGD",
-        "base_lr": 0.1,
-        "weight_decay": 1e-4,
-        "schedule_type": "multistep",
-        "steps": [
-            20,
-            40,
-            60,
-            80
-        ],
-        "optimizer_params":
-        {
-            "momentum": 0.9,
-            "nesterov": true
-        }
-    },
-    "compression": [
-        {
-            "algorithm": "filter_pruning",
-            "pruning_init": 0.1,
-            "params": {
-                "schedule": "exponential",
-                "pruning_target": 0.4,
-                "pruning_steps": 15,
-                "filter_importance": "geometric_median"
-            }
-        }
-    ]
-}
\ No newline at end of file
diff --git a/examples/torch/classification/configs/pruning/resnet34_pruning_geometric_median_kd.json b/examples/torch/classification/configs/pruning/resnet34_pruning_geometric_median_kd.json
index 6e5e3bc6b96..c8b27d2e736 100644
--- a/examples/torch/classification/configs/pruning/resnet34_pruning_geometric_median_kd.json
+++ b/examples/torch/classification/configs/pruning/resnet34_pruning_geometric_median_kd.json
@@ -12,9 +12,9 @@
         "weight_decay": 1e-4,
         "schedule_type": "multistep",
         "steps": [
-            20,
-            40,
-            60,
+            25,
+            45,
+            65,
             80
         ],
         "optimizer_params":
@@ -29,8 +29,8 @@
             "pruning_init": 0.1,
             "params": {
                 "schedule": "exponential",
-                "pruning_target": 0.4,
-                "pruning_steps": 15,
+                "pruning_target": 0.5,
+                "pruning_steps": 20,
                 "filter_importance": "geometric_median"
             }
         },
@@ -39,4 +39,4 @@
             "type": "softmax" // or mse
         }
     ]
-}
\ No newline at end of file
+}
diff --git a/third_party_integration/huggingface_transformers/0001-Modifications-for-NNCF-usage.patch b/third_party_integration/huggingface_transformers/0001-Modifications-for-NNCF-usage.patch
index 21a237422ab..22797f933fa 100644
--- a/third_party_integration/huggingface_transformers/0001-Modifications-for-NNCF-usage.patch
+++ b/third_party_integration/huggingface_transformers/0001-Modifications-for-NNCF-usage.patch
@@ -1,6 +1,6 @@
-From 8e5d5b5798febec45bb442e96f2633ac1c2bf6ba Mon Sep 17 00:00:00 2001
-From: Vasily Shamporov
-Date: Mon, 2 Aug 2021 17:46:48 +0300
+From d333a563fd28c538f93217af3d7556141d19d51b Mon Sep 17 00:00:00 2001
+From: skholkin
+Date: Fri, 4 Mar 2022 14:03:05 +0300
 Subject: [PATCH] Modifications for NNCF usage
 
 ---
@@ -12,9 +12,10 @@ Subject: [PATCH] Modifications for NNCF usage
  nncf_bert_config_conll.json                   |  44 +++++++
  nncf_bert_config_mrpc.json                    |  42 +++++++
  nncf_bert_config_squad.json                   |  44 +++++++
+ nncf_bert_config_squad_kd.json                |  50 ++++++++
  ...config_squad_magnitude_sparsity_cubic.json |  31 +++++
  nncf_bert_config_xnli.json                    |  36 ++++++
- nncf_distilbert_config_sst2.json              |  33 +++++
+ nncf_distilbert_config_sst2.json              |  34 +++++
  nncf_gpt2_config_wikitext_hw_config.json      |  58 +++++++++
  nncf_mobilebert_config_squad_int8.json        |  46 +++++++
  nncf_roberta_config_mnli.json                 |  36 ++++++
@@ -23,10 +24,11 @@ Subject: [PATCH] Modifications for NNCF usage
  src/transformers/trainer.py                   |  74 +++++++++--
  src/transformers/trainer_callback.py          |   2 +
  src/transformers/training_args.py             |   6 +
- 19 files changed, 842 insertions(+), 73 deletions(-)
+ 20 files changed, 893 insertions(+), 73 deletions(-)
  create mode 100644 nncf_bert_config_conll.json
  create mode 100644 nncf_bert_config_mrpc.json
  create mode 100644 nncf_bert_config_squad.json
+ create mode 100644 nncf_bert_config_squad_kd.json
  create mode 100644 nncf_bert_config_squad_magnitude_sparsity_cubic.json
  create mode 100644 nncf_bert_config_xnli.json
  create mode 100644 nncf_distilbert_config_sst2.json
@@ -851,6 +853,62 @@ index 000000000..12e0440f7
 +        }
 +    }
 +}
+diff --git a/nncf_bert_config_squad_kd.json b/nncf_bert_config_squad_kd.json
+new file mode 100644
+index 000000000..f5872a0a2
+--- /dev/null
++++ b/nncf_bert_config_squad_kd.json
+@@ -0,0 +1,50 @@
++{
++    "input_info": [
++        {
++            "sample_size": [1, 384],
++            "type": "long"
++        },
++        {
++            "sample_size": [1, 384],
++            "type": "long"
++        },
++        {
++            "sample_size": [1, 384],
++            "type": "long"
++        }
++    ],
++    "compression": [{
++    "algorithm": "quantization",
++    "initializer": {
++        "range": {
++            "num_init_samples": 32,
++            "type": "percentile",
++            "params":
++            {
++                "min_percentile": 0.01,
++                "max_percentile": 99.99
++            }
++        },
++
++        "batchnorm_adaptation": {
++            "num_bn_adaptation_samples": 200
++        }
++    },
++    "activations":
++    {
++        "mode": "symmetric"
++    },
++    "weights":
++    {
++        "mode": "symmetric",
++        "signed": true,
++        "per_channel": false
++    }
++    },
++    {
++        "algorithm": "knowledge_distillation",
++        "type": "softmax",
++        "temperature": 3
++    }
++    ]
++}
 diff --git a/nncf_bert_config_squad_magnitude_sparsity_cubic.json b/nncf_bert_config_squad_magnitude_sparsity_cubic.json
 new file mode 100644
 index 000000000..b4452e8d4
@@ -932,10 +990,10 @@ index 000000000..a21a522fc
 +}
 diff --git a/nncf_distilbert_config_sst2.json b/nncf_distilbert_config_sst2.json
 new file mode 100644
-index 000000000..6b648ca5e
+index 000000000..868735016
 --- /dev/null
 +++ b/nncf_distilbert_config_sst2.json
-@@ -0,0 +1,33 @@
+@@ -0,0 +1,34 @@
 +{
 +    "input_info": [
 +        {
 +            "sample_size": [1, 128],
 +            "type": "long"
 +        }
 +    ],
 +    "compression": [{
 +    "algorithm": "quantization",
 +    "initializer": {
 +        "range": {
-+            "num_init_samples": 32
++            "num_init_samples": 32,
++            "type": "mean_percentile"
 +        }
 +    },
 +    "ignored_scopes": [
diff --git a/third_party_integration/huggingface_transformers/README.md b/third_party_integration/huggingface_transformers/README.md
index cad7458d0b4..8d2b6e66a57 100644
--- a/third_party_integration/huggingface_transformers/README.md
+++ b/third_party_integration/huggingface_transformers/README.md
@@ -43,12 +43,17 @@ _INT8 model (symmetric weights, asymmetric activations quantization)_ - 77.22% a
 
 _Full-precision FP32 baseline model_ - bert-large-uncased-whole-word-masking model, trained on SQuAD v1.1 - 93.21% F1, 87.2% EM on the dev set,
 
-_INT8 model (symmetric quantization)_ - 92.60% F1, 86.36% EM on the dev set.
+_INT8 model (symmetric quantization)_ - 92.55% F1, 86.1% EM on the dev set.
 
 **INT8 model quantization-aware training command line (trained on 4x Tesla V100):**
 
-`python examples/pytorch/question-answering/run_qa.py --model_name_or_path bert-large-uncased-whole-word-masking --do_train --do_eval --dataset_name squad --learning_rate 3e-5 --num_train_epochs 2 --max_seq_length 384 --doc_stride 128 --output_dir bert_squad_int8 --per_gpu_eval_batch_size=1 --per_gpu_train_batch_size=6 --save_steps=400 --nncf_config nncf_bert_config_squad.json`
+`python examples/pytorch/question-answering/run_qa.py --model_name_or_path bert-large-uncased-whole-word-masking --do_train --do_eval --dataset_name squad --learning_rate 3e-5 --num_train_epochs 2 --max_seq_length 384 --doc_stride 128 --output_dir bert_squad_int8 --per_gpu_eval_batch_size=1 --per_gpu_train_batch_size=10 --save_steps=400 --nncf_config nncf_bert_config_squad.json`
 
+_INT8 model (symmetric quantization) + Knowledge Distillation_ - 92.89% F1, 86.68% EM on the dev set.
+
+**INT8 model quantization-aware training + Knowledge Distillation command line (trained on 4x Tesla V100):**
+
+`python examples/pytorch/question-answering/run_qa.py --model_name_or_path bert-large-uncased-whole-word-masking --do_train --do_eval --dataset_name squad --learning_rate 3e-5 --num_train_epochs 2 --max_seq_length 384 --doc_stride 128 --output_dir bert_squad_int8 --per_gpu_eval_batch_size=1 --per_gpu_train_batch_size=10 --save_steps=400 --nncf_config nncf_bert_config_squad_kd.json`
 
 **Fine-tuned INT8 model evaluation and ONNX export command line:**
 
@@ -105,7 +110,7 @@ _INT8 model (asymmetrically quantized)_ - 89.25% accuracy (matched), 88.9% accur
 
 _Full-precision FP32 baseline model_ - distilbert-base-uncased-finetuned-sst-2-english, pre-trained on SST-2 - 91.1% accuracy
 
-_INT8 model (symmetrically quantized)_ - 90.3% accuracy
+_INT8 model (symmetrically quantized)_ - 90.94% accuracy
 
 **INT8 model quantization-aware training command line:**
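
For reference, the JSON files touched by this diff are ordinary NNCF configuration files. The sketch below shows roughly how a pruning-plus-knowledge-distillation config like `resnet34_pruning_geometric_median_kd.json` is consumed; it is a minimal illustration assuming the NNCF 2.x PyTorch entry points (`NNCFConfig`, `register_default_init_args`, `create_compressed_model`). The random data loader is a stand-in for ImageNet, and only the `compression` section of the config is inlined here; the `model`, `optimizer`, `batch_size`, and `epochs` keys in the example configs are read by the sample training scripts, not by NNCF itself.

```python
import torch
import torchvision

from nncf import NNCFConfig
from nncf.torch import create_compressed_model, register_default_init_args

# Stand-in data loader (the real config trains on ImageNet with batch size 512).
dataset = torch.utils.data.TensorDataset(
    torch.randn(8, 3, 224, 224), torch.randint(0, 1000, (8,))
)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=4)

model = torchvision.models.resnet34()  # the example config additionally requests pretrained weights

# Inlined subset of resnet34_pruning_geometric_median_kd.json: filter pruning + softmax KD.
nncf_config = NNCFConfig.from_dict({
    "input_info": {"sample_size": [1, 3, 224, 224]},
    "compression": [
        {
            "algorithm": "filter_pruning",
            "pruning_init": 0.1,
            "params": {
                "schedule": "exponential",
                "pruning_target": 0.5,
                "pruning_steps": 20,
                "filter_importance": "geometric_median",
            },
        },
        {"algorithm": "knowledge_distillation", "type": "softmax"},
    ],
})
# Attach a loader so default initializers (e.g. batchnorm adaptation) can run.
nncf_config = register_default_init_args(nncf_config, train_loader)

# Wrap the model; the controller exposes the compression loss and the pruning scheduler.
compression_ctrl, compressed_model = create_compressed_model(model, nncf_config)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(compressed_model.parameters(), lr=0.1, momentum=0.9)

compression_ctrl.scheduler.epoch_step()  # advance the exponential pruning schedule once per epoch
for images, targets in train_loader:
    optimizer.zero_grad()
    outputs = compressed_model(images)
    # The distillation term from the "knowledge_distillation" algorithm is added via loss().
    loss = criterion(outputs, targets) + compression_ctrl.loss()
    loss.backward()
    optimizer.step()
    compression_ctrl.scheduler.step()
```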