@@ -35,6 +35,9 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
:type similarity_threshold: int
:param f1_score_threshold: The threshold for F1 score evaluation. Default is 0.5.
:type f1_score_threshold: float
:keyword is_reasoning_model: If True, the evaluator uses a reasoning-model configuration (o1/o3 models).
This adjusts parameters such as max_completion_tokens and removes unsupported parameters. Default is False.
:paramtype is_reasoning_model: bool
:return: A callable class that evaluates and generates metrics for the "question-answering" scenario.
:param kwargs: Additional arguments to pass to the evaluator.
:type kwargs: Any
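
For reference, a minimal usage sketch of the new keyword (the endpoint, key, and deployment values below are illustrative placeholders, not part of this change):

# Sketch only: enabling reasoning-model handling on QAEvaluator.
from azure.ai.evaluation import QAEvaluator

model_config = {
    "azure_endpoint": "https://<your-endpoint>.openai.azure.com",
    "api_key": "<your-api-key>",
    "azure_deployment": "o3-mini",  # assumed reasoning-model deployment
}

qa_evaluator = QAEvaluator(model_config=model_config, is_reasoning_model=True)
result = qa_evaluator(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
    context="France's capital city is Paris.",
    ground_truth="Paris",
)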
@@ -87,6 +90,7 @@ def __init__(
fluency_threshold: int = 3,
similarity_threshold: int = 3,
f1_score_threshold: float = 0.5,
is_reasoning_model: bool = False,
**kwargs,
):
# Type checking
@@ -102,11 +106,31 @@
raise TypeError(f"{name} must be an int or float, got {type(value)}")

evaluators = [
GroundednessEvaluator(model_config, threshold=groundedness_threshold),
RelevanceEvaluator(model_config, threshold=relevance_threshold),
CoherenceEvaluator(model_config, threshold=coherence_threshold),
FluencyEvaluator(model_config, threshold=fluency_threshold),
SimilarityEvaluator(model_config, threshold=similarity_threshold),
GroundednessEvaluator(
model_config,
threshold=groundedness_threshold,
is_reasoning_model=is_reasoning_model,
),
RelevanceEvaluator(
model_config,
threshold=relevance_threshold,
is_reasoning_model=is_reasoning_model,
),
CoherenceEvaluator(
model_config,
threshold=coherence_threshold,
is_reasoning_model=is_reasoning_model,
),
FluencyEvaluator(
model_config,
threshold=fluency_threshold,
is_reasoning_model=is_reasoning_model,
),
SimilarityEvaluator(
model_config,
threshold=similarity_threshold,
is_reasoning_model=is_reasoning_model,
),
F1ScoreEvaluator(threshold=f1_score_threshold),
]
super().__init__(evaluators=evaluators, **kwargs)
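
Since the keyword is forwarded verbatim to each prompt-based sub-evaluator, the same flag can also be set on one of them directly; a minimal sketch, reusing an illustrative model_config (values are placeholders):

# Sketch only: the constructor calls above show that each quality evaluator
# accepts is_reasoning_model directly.
from azure.ai.evaluation import GroundednessEvaluator

model_config = {
    "azure_endpoint": "https://<your-endpoint>.openai.azure.com",
    "api_key": "<your-api-key>",
    "azure_deployment": "o3-mini",  # assumed reasoning-model deployment
}

groundedness = GroundednessEvaluator(
    model_config,
    threshold=3,
    is_reasoning_model=True,
)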
@@ -9,6 +9,7 @@
RetrievalEvaluator,
RelevanceEvaluator,
GroundednessEvaluator,
QAEvaluator,
)


@@ -243,3 +244,19 @@ def test_groundedness_evaluator_missing_required_inputs(self, mock_model_config)
"Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query' and 'response' are required."
in exc_info.value.args[0]
)

def test_qa_evaluator_is_reasoning_model_default(self, mock_model_config):
"""Test QAEvaluator initializes with is_reasoning_model defaulting to False"""
qa_eval = QAEvaluator(model_config=mock_model_config)
# Check that all model-based evaluators have is_reasoning_model set to False
for evaluator in qa_eval._evaluators:
if hasattr(evaluator, '_is_reasoning_model'):
assert evaluator._is_reasoning_model is False

def test_qa_evaluator_is_reasoning_model_true(self, mock_model_config):
"""Test QAEvaluator properly passes is_reasoning_model=True to sub-evaluators"""
qa_eval = QAEvaluator(model_config=mock_model_config, is_reasoning_model=True)
# Check that all model-based evaluators have is_reasoning_model set to True
for evaluator in qa_eval._evaluators:
if hasattr(evaluator, '_is_reasoning_model'):
assert evaluator._is_reasoning_model is True
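
The new tests rely on the mock_model_config fixture defined elsewhere in this test module; a plausible sketch of such a fixture (names and values are assumptions, not taken from this change):

import pytest
from azure.ai.evaluation import AzureOpenAIModelConfiguration

@pytest.fixture
def mock_model_config() -> AzureOpenAIModelConfiguration:
    # Assumed shape only; the real fixture may differ.
    return AzureOpenAIModelConfiguration(
        azure_endpoint="https://mock-endpoint.openai.azure.com",
        api_key="mock-api-key",
        azure_deployment="mock-deployment",
    )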