@@ -35,6 +35,9 @@ class QAEvaluator(MultiEvaluatorBase[Union[str, float]]):
:type similarity_threshold: int
:param f1_score_threshold: The threshold for F1 score evaluation. Default is 0.5.
:type f1_score_threshold: float
:keyword is_reasoning_model: If True, the evaluator uses a reasoning-model configuration (o1/o3 models).
This adjusts parameters such as max_completion_tokens and removes unsupported parameters. Default is False.
:paramtype is_reasoning_model: bool
:return: A callable class that evaluates and generates metrics for the "question-answering" scenario.
:param kwargs: Additional arguments to pass to the evaluator.
:type kwargs: Any
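
For reference, a minimal usage sketch of the new keyword (the endpoint, key, and deployment values below are illustrative placeholders, not part of this change):

# Sketch only: enabling reasoning-model handling on QAEvaluator.
from azure.ai.evaluation import QAEvaluator

model_config = {
    "azure_endpoint": "https://<your-endpoint>.openai.azure.com",
    "api_key": "<your-api-key>",
    "azure_deployment": "o3-mini",  # assumed reasoning-model deployment
}

qa_evaluator = QAEvaluator(model_config=model_config, is_reasoning_model=True)
result = qa_evaluator(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
    context="France's capital city is Paris.",
    ground_truth="Paris",
)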
@@ -87,6 +90,7 @@ def __init__(
fluency_threshold: int = 3,
similarity_threshold: int = 3,
f1_score_threshold: float = 0.5,
is_reasoning_model: bool = False,
**kwargs,
):
# Type checking
@@ -102,11 +106,31 @@
raise TypeError(f"{name} must be an int or float, got {type(value)}")

evaluators = [
GroundednessEvaluator(model_config, threshold=groundedness_threshold),
RelevanceEvaluator(model_config, threshold=relevance_threshold),
CoherenceEvaluator(model_config, threshold=coherence_threshold),
FluencyEvaluator(model_config, threshold=fluency_threshold),
SimilarityEvaluator(model_config, threshold=similarity_threshold),
GroundednessEvaluator(
model_config,
threshold=groundedness_threshold,
is_reasoning_model=is_reasoning_model,
),
RelevanceEvaluator(
model_config,
threshold=relevance_threshold,
is_reasoning_model=is_reasoning_model,
),
CoherenceEvaluator(
model_config,
threshold=coherence_threshold,
is_reasoning_model=is_reasoning_model,
),
FluencyEvaluator(
model_config,
threshold=fluency_threshold,
is_reasoning_model=is_reasoning_model,
),
SimilarityEvaluator(
model_config,
threshold=similarity_threshold,
is_reasoning_model=is_reasoning_model,
),
F1ScoreEvaluator(threshold=f1_score_threshold),
]
super().__init__(evaluators=evaluators, **kwargs)
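
Since the keyword is forwarded verbatim to each prompt-based sub-evaluator, the same flag can also be set on one of them directly; a minimal sketch, reusing an illustrative model_config (values are placeholders):

# Sketch only: the constructor calls above show that each quality evaluator
# accepts is_reasoning_model directly.
from azure.ai.evaluation import GroundednessEvaluator

model_config = {
    "azure_endpoint": "https://<your-endpoint>.openai.azure.com",
    "api_key": "<your-api-key>",
    "azure_deployment": "o3-mini",  # assumed reasoning-model deployment
}

groundedness = GroundednessEvaluator(
    model_config,
    threshold=3,
    is_reasoning_model=True,
)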
@@ -9,6 +9,7 @@
RetrievalEvaluator,
RelevanceEvaluator,
GroundednessEvaluator,
QAEvaluator,
)


@@ -243,3 +244,19 @@ def test_groundedness_evaluator_missing_required_inputs(self, mock_model_config)
"Either 'conversation' or individual inputs must be provided. For Agent groundedness 'query' and 'response' are required."
in exc_info.value.args[0]
)

def test_qa_evaluator_is_reasoning_model_default(self, mock_model_config):
"""Test QAEvaluator initializes with is_reasoning_model defaulting to False"""
qa_eval = QAEvaluator(model_config=mock_model_config)
# Check that all model-based evaluators have is_reasoning_model set to False
for evaluator in qa_eval._evaluators:
if hasattr(evaluator, '_is_reasoning_model'):
assert evaluator._is_reasoning_model is False

def test_qa_evaluator_is_reasoning_model_true(self, mock_model_config):
"""Test QAEvaluator properly passes is_reasoning_model=True to sub-evaluators"""
qa_eval = QAEvaluator(model_config=mock_model_config, is_reasoning_model=True)
# Check that all model-based evaluators have is_reasoning_model set to True
for evaluator in qa_eval._evaluators:
if hasattr(evaluator, '_is_reasoning_model'):
assert evaluator._is_reasoning_model is True
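
The new tests rely on the mock_model_config fixture defined elsewhere in this test module; a plausible sketch of such a fixture (names and values are assumptions, not taken from this change):

import pytest
from azure.ai.evaluation import AzureOpenAIModelConfiguration

@pytest.fixture
def mock_model_config() -> AzureOpenAIModelConfiguration:
    # Assumed shape only; the real fixture may differ.
    return AzureOpenAIModelConfiguration(
        azure_endpoint="https://mock-endpoint.openai.azure.com",
        api_key="mock-api-key",
        azure_deployment="mock-deployment",
    )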