diff --git a/tests/test_model.py b/tests/test_model.py
index b8b6366fb7..1cad36a8db 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -415,64 +415,6 @@ def test_against_hf_mixtral():
     torch.testing.assert_close(ours_y, theirs_y)
 
 
-@torch.inference_mode()
-@pytest.mark.parametrize(
-    ("device", "dtype"),
-    [
-        (torch.device("cpu"), torch.float32),
-        pytest.param(
-            torch.device("cuda"),
-            torch.float16,
-            marks=[
-                # the reference does softmax upscaled to fp32 during attention. additionally, the final layernorm input
-                # is slightly different
-                pytest.mark.xfail(raises=AssertionError, strict=False),
-                RunIf(min_cuda_gpus=1),
-            ],
-        ),
-    ],
-)
-def test_against_hf_h2o_danube(device, dtype):
-    torch.set_default_dtype(dtype)
-
-    ours_config = Config.from_name(
-        "Danube2-1.8b-chat",
-        padded_vocab_size=10000,
-        n_layer=2,
-        n_embd=16,
-        n_head=8,
-        n_query_groups=2,
-        intermediate_size=43,
-    )
-    T = 5
-    theirs_config = MistralConfig(
-        vocab_size=ours_config.padded_vocab_size,
-        hidden_size=ours_config.n_embd,
-        num_attention_heads=ours_config.n_head,
-        num_hidden_layers=ours_config.n_layer,
-        intermediate_size=ours_config.intermediate_size,
-        max_position_embeddings=T,
-        rms_norm_eps=ours_config.norm_eps,
-        num_key_value_heads=ours_config.n_query_groups,
-        rope_theta=ours_config.rope_base,
-    )
-    assert ours_config.intermediate_size == theirs_config.intermediate_size
-
-    theirs_model = MistralForCausalLM(theirs_config).to(device)
-    theirs_state_dict = theirs_model.state_dict()
-    state_dict = {}
-    copy_weights_hf_llama(ours_config, {}, state_dict, theirs_state_dict)
-    ours_model = GPT(ours_config).to(device)
-    ours_model.load_state_dict(state_dict)
-
-    # test end to end
-    x = torch.tensor([[9856, 23, 491, 1536, 304]], dtype=torch.int32, device=device)
-    assert x.size(1) == T
-    ours_y = ours_model(x)
-    theirs_y = theirs_model(x)["logits"].to(dtype)  # HF converts logits to float
-    torch.testing.assert_close(ours_y, theirs_y)
-
-
 @torch.inference_mode()
 @pytest.mark.parametrize(
     ("device", "dtype"),