
Commit a6f76fe

Feat/quoted spans metric (#2311)
## Issue Link / Problem Description

- contd #2237

Co-authored-by: Mohd Ibrahim A. <106313018+mohdibrahimai@users.noreply.github.com>
1 parent 96284d7 commit a6f76fe

File tree

- docs/quoted_spans_metric.md
- src/ragas/metrics/quoted_spans.py
- tests/test_quoted_spans.py

3 files changed: +207 -0 lines changed

docs/quoted_spans_metric.md

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
## `citation_alignment_quoted_spans`

**What:** A metric that measures the fraction of quoted spans in a model’s answer
that appear verbatim in the retrieved sources. The score is in the range
[0, 1], where 1.0 indicates every quoted span is supported by evidence and 0.0
indicates no quoted spans are found in the sources.

**Why:** Users place extra trust in exact quotes. When a model quotes facts
that aren’t present in its evidence, it undermines reliability. This metric
helps catch cases of citation drift where quoted phrases in the answer are
unsupported.

**Input shape:**

- `answers: List[str]` – list of model answers (length N)
- `sources: List[List[str]]` – list (length N) of lists of source passages

**Output:** A dictionary containing:

```python
{
    "citation_alignment_quoted_spans": float,  # score in [0,1]
    "matched": float,  # number of spans found in sources
    "total": float     # total number of spans considered
}
```

**Notes:**

- The implementation normalizes text by collapsing whitespace and lower-casing.
- Spans shorter than three words are ignored by default; adjust `min_len` to change this.
- If no quoted spans are found across all answers, the score is defined as 0.0 with
  `total = 0`.
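For orientation, here is a minimal usage sketch of `quoted_spans_alignment`, the function this commit adds in `src/ragas/metrics/quoted_spans.py`; the answer and source strings below are illustrative, not taken from the repository:

```python
from ragas.metrics.quoted_spans import quoted_spans_alignment

# One answer containing a single quoted span that its source supports verbatim.
answers = ['The report notes that "growth slowed in the second quarter".']
sources = [["Analysts agreed that growth slowed in the second quarter of the year."]]

result = quoted_spans_alignment(answers, sources)
# result == {"citation_alignment_quoted_spans": 1.0, "matched": 1.0, "total": 1.0}
```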

src/ragas/metrics/quoted_spans.py

Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
"""
Quoted Spans Alignment Metric
=============================

This module provides a simple metric to measure citation alignment for quoted spans
in model-generated answers. The idea is to compute the fraction of quoted spans
appearing verbatim in any of the provided source passages. If an answer quotes
facts that cannot be found in the sources, the metric will reflect that drift.

The metric function is designed to be plug-and-play in existing evaluation
pipelines. It returns a score in the range [0, 1] along with the raw counts for
matched and total quoted spans. It performs light normalization by collapsing
whitespace and lower-casing strings. You can adjust the minimum length of a
quoted span and choose to disable case folding if desired.
"""

from __future__ import annotations

import re
from typing import Dict, Sequence

# Regular expression to extract both straight and curly quoted spans. Matches
# pairs of quotes and captures the inner text.
_QUOTE_RE = re.compile(r"[\"“”'‘’`´](.*?)[\"“”'‘’`´]")
def _normalize(text: str) -> str:
    """Normalize text by collapsing whitespace and lower-casing it."""
    return re.sub(r"\s+", " ", text).strip().lower()


def _extract_quoted_spans(answer: str, *, min_len: int = 3) -> Sequence[str]:
    """
    Extract quoted spans from an answer.

    Parameters
    ----------
    answer: str
        The model answer to search for quoted spans.
    min_len: int, optional
        Minimum number of words required for a span to be considered. Shorter
        spans are ignored to avoid spurious matches.

    Returns
    -------
    Sequence[str]
        A list of quoted spans (strings) that meet the minimum length
        requirement.
    """
    spans: list[str] = []
    for match in _QUOTE_RE.finditer(answer):
        span = (match.group(1) or "").strip()
        # filter out spans shorter than min_len words
        if len(span.split()) >= min_len:
            spans.append(span)
    return spans


def quoted_spans_alignment(
    answers: Sequence[str],
    sources: Sequence[Sequence[str]],
    *,
    casefold: bool = True,
    min_len: int = 3,
) -> Dict[str, float]:
    """
    Compute the citation alignment score for quoted spans in model answers.

    Parameters
    ----------
    answers: Sequence[str]
        List of model answers (length N).
    sources: Sequence[Sequence[str]]
        List of lists (length N) containing passages for each answer.
    casefold: bool, optional
        Whether to normalize text by lower-casing before matching. Defaults
        to True.
    min_len: int, optional
        Minimum number of words in a quoted span. Defaults to 3.

    Returns
    -------
    Dict[str, float]
        A dictionary containing:
        - "citation_alignment_quoted_spans": the fraction of quoted
          spans found verbatim in the provided sources.
        - "matched": number of spans that were matched
        - "total": total number of spans considered

    Notes
    -----
    If no quoted spans are found across the dataset, the score is defined as
    0.0, with matched=0 and total=0. Matching is substring matching on
    normalized text.
    """
    if len(answers) != len(sources):
        raise ValueError("answers and sources must have the same length")
    matched = 0
    total = 0

    for answer, src_list in zip(answers, sources):
        spans = _extract_quoted_spans(answer, min_len=min_len)
        if not spans:
            continue
        # join all sources for this answer into one string
        joined_sources = " ".join(src_list)
        if casefold:
            normalized_sources = _normalize(joined_sources)
        else:
            normalized_sources = joined_sources

        for span in spans:
            total += 1
            span_norm = _normalize(span) if casefold else span
            # check if the normalized span appears in the normalized sources
            if span_norm and span_norm in normalized_sources:
                matched += 1

    score = (matched / total) if total else 0.0
    return {
        "citation_alignment_quoted_spans": float(score),
        "matched": float(matched),
        "total": float(total),
    }
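To illustrate the `casefold` flag described in the docstring above, here is a small sketch; the example strings are made up for illustration and are not part of the commit:

```python
from ragas.metrics.quoted_spans import quoted_spans_alignment

# Illustrative data: the quoted span matches its source except for casing.
answers = ['The memo says "Revenue Grew Slightly" this quarter.']
sources = [["Internal notes: revenue grew slightly in Q3."]]

# Default behaviour lower-cases and whitespace-collapses both sides before matching.
quoted_spans_alignment(answers, sources)
# -> {"citation_alignment_quoted_spans": 1.0, "matched": 1.0, "total": 1.0}

# With casefold=False the quoted span must appear in the sources exactly as written.
quoted_spans_alignment(answers, sources, casefold=False)
# -> {"citation_alignment_quoted_spans": 0.0, "matched": 0.0, "total": 1.0}
```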

tests/test_quoted_spans.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
"""
Unit tests for the quoted spans alignment metric.

These tests are written using pytest and cover several common cases:
- A perfect match where the quoted span appears in the sources.
- A mismatch where the quoted span does not appear in the sources.
- Case and whitespace variations to verify normalization logic.
- Answers with no quoted spans to ensure the score is zero and total is zero.

To run these tests, install pytest and run `pytest` in the repository root.
"""

from ragas.metrics.quoted_spans import quoted_spans_alignment


def test_perfect_match():
    """Quoted span matches exactly in the source."""
    answers = ['Paris is "the capital of France".']
    sources = [["The capital of France is Paris."]]
    result = quoted_spans_alignment(answers, sources)
    assert result["citation_alignment_quoted_spans"] == 1.0
    assert result["matched"] == 1.0
    assert result["total"] == 1.0


def test_mismatch_detected():
    """Quoted span does not appear in the sources."""
    answers = ['GDP was "$2.9T" in 2023.']
    sources = [["…GDP was $2.7T in 2023 per WB…"]]
    result = quoted_spans_alignment(answers, sources, min_len=1)
    assert result["citation_alignment_quoted_spans"] == 0.0
    assert result["matched"] == 0.0
    assert result["total"] == 1.0


def test_mixed_case_and_whitespace():
    """Matching should be case-insensitive and handle extra whitespace."""
    answers = ['Result: "Delta E = mc ^ 2".']
    sources = [["…delta e = mc ^ 2 holds…"]]
    result = quoted_spans_alignment(answers, sources)
    assert result["citation_alignment_quoted_spans"] == 1.0


def test_no_quotes_returns_zero_with_zero_denominator():
    """An answer with no quoted spans should yield score 0.0 and total 0."""
    answers = ["No quotes here."]
    sources = [["Irrelevant."]]
    result = quoted_spans_alignment(answers, sources)
    assert result["citation_alignment_quoted_spans"] == 0.0
    assert result["total"] == 0.0
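One further case these tests do not cover is the fractional score when only some of an answer's quoted spans are supported; a hypothetical extra test (not part of this commit) could look like this:

```python
def test_partial_match_gives_fractional_score():
    """Hypothetical extra case: one of two quoted spans is supported, so the score is 0.5."""
    answers = ['It said "the sky is blue" and "the grass is purple".']
    sources = [["Everyone agrees the sky is blue."]]
    result = quoted_spans_alignment(answers, sources)
    assert result["citation_alignment_quoted_spans"] == 0.5
    assert result["matched"] == 1.0
    assert result["total"] == 2.0
```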
