Skip to content

Judge API Reference (Evaluation)

boring.judge

Judge Package - V10.25 Advanced Evaluation

Provides LLM-as-a-Judge evaluation capabilities including:

- Code quality grading with customizable rubrics
- Pairwise comparison with position bias detection
- Evaluation metrics (Kappa, Spearman, F1)
- Bias monitoring and reporting

BiasMonitor

Monitors systematic biases in LLM evaluation over time.

Features:

- Track pairwise comparison outcomes
- Detect position bias (first-position preference)
- Detect length bias (longer responses get higher scores)
- Generate bias reports with recommendations

Source code in src/boring/judge/bias_monitor.py
class BiasMonitor:
    """
    Monitors systematic biases in LLM evaluation over time.

    Features:
    - Track pairwise comparison outcomes
    - Detect position bias (first-position preference)
    - Detect length bias (longer responses get higher scores)
    - Generate bias reports with recommendations
    """

    def __init__(self, project_root: Path):
        """
        Initialize bias monitor.

        Args:
            project_root: Project root for database storage
        """
        self.project_root = Path(project_root)
        # SQLite file lives under the project's .boring_memory directory.
        self.db_path = self.project_root / ".boring_memory" / "bias_monitor.db"
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self._init_db()

    def _get_connection(self) -> sqlite3.Connection:
        """
        Get the thread-local connection for this monitor's database.

        Connections are cached per (thread, db_path).  The previous
        implementation cached a single connection under one attribute
        (``_local.bias_conn``), so two monitors with different project
        roots used in the same thread would silently share the first
        monitor's database file.
        """
        cache = getattr(_local, "bias_conns", None)
        if cache is None:
            cache = {}
            _local.bias_conns = cache
        key = str(self.db_path)
        if key not in cache:
            conn = sqlite3.connect(key, check_same_thread=False)
            # Row factory lets callers access columns by name.
            conn.row_factory = sqlite3.Row
            cache[key] = conn
        return cache[key]

    def _init_db(self):
        """Initialize database schema (idempotent: CREATE IF NOT EXISTS)."""
        conn = self._get_connection()
        conn.executescript("""
            -- Pairwise comparison tracking
            CREATE TABLE IF NOT EXISTS pairwise_evaluations (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                evaluation_id TEXT UNIQUE,
                winner TEXT NOT NULL,  -- 'A', 'B', 'TIE'
                first_position TEXT NOT NULL,  -- Which was in first position
                first_position_won INTEGER,  -- 1 if first position won
                position_consistent INTEGER,  -- 1 if both passes agreed
                confidence REAL,
                response_a_length INTEGER,
                response_b_length INTEGER,
                created_at TEXT NOT NULL
            );

            -- Direct scoring tracking
            CREATE TABLE IF NOT EXISTS direct_evaluations (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                evaluation_id TEXT UNIQUE,
                score REAL NOT NULL,
                response_length INTEGER,
                dimension_scores TEXT,  -- JSON
                created_at TEXT NOT NULL
            );

            -- Indices
            CREATE INDEX IF NOT EXISTS idx_pairwise_created ON pairwise_evaluations(created_at);
            CREATE INDEX IF NOT EXISTS idx_direct_created ON direct_evaluations(created_at);
        """)
        conn.commit()

    def record_pairwise_evaluation(
        self,
        evaluation_id: str,
        winner: str,
        first_position: str,
        position_consistent: bool,
        confidence: float = 0.0,
        response_a_length: int = 0,
        response_b_length: int = 0,
    ):
        """
        Record a pairwise comparison result.

        Args:
            evaluation_id: Unique identifier for this evaluation
            winner: Winner of comparison ('A', 'B', 'TIE')
            first_position: Which response was in first position ('A' or 'B')
            position_consistent: Whether both position passes agreed
            confidence: Confidence score of the decision
            response_a_length: Length of response A
            response_b_length: Length of response B
        """
        winner = winner.upper()
        first_position = first_position.upper()

        # First position only "wins" on a decisive (non-tie) verdict.
        first_position_won = 1 if winner == first_position and winner != "TIE" else 0

        conn = self._get_connection()
        try:
            conn.execute(
                """
                INSERT OR REPLACE INTO pairwise_evaluations
                (evaluation_id, winner, first_position, first_position_won,
                 position_consistent, confidence, response_a_length,
                 response_b_length, created_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
                (
                    evaluation_id,
                    winner,
                    first_position,
                    first_position_won,
                    1 if position_consistent else 0,
                    confidence,
                    response_a_length,
                    response_b_length,
                    datetime.now().isoformat(),
                ),
            )
            conn.commit()
            logger.debug(f"Recorded pairwise evaluation: {evaluation_id}")
        except Exception as e:
            # Recording is best-effort: never let telemetry break evaluation.
            logger.error(f"Failed to record pairwise evaluation: {e}")

    def record_direct_evaluation(
        self,
        evaluation_id: str,
        score: float,
        response_length: int,
        dimension_scores: dict | None = None,
    ):
        """
        Record a direct scoring evaluation.

        Args:
            evaluation_id: Unique identifier for this evaluation
            score: Overall score
            response_length: Length of the response (characters or tokens)
            dimension_scores: Optional per-dimension scores
        """
        conn = self._get_connection()
        try:
            conn.execute(
                """
                INSERT OR REPLACE INTO direct_evaluations
                (evaluation_id, score, response_length, dimension_scores, created_at)
                VALUES (?, ?, ?, ?, ?)
            """,
                (
                    evaluation_id,
                    score,
                    response_length,
                    # Dimension scores are stored as a JSON blob ({} when absent).
                    json.dumps(dimension_scores or {}),
                    datetime.now().isoformat(),
                ),
            )
            conn.commit()
            logger.debug(f"Recorded direct evaluation: {evaluation_id}")
        except Exception as e:
            # Best-effort, same as record_pairwise_evaluation.
            logger.error(f"Failed to record direct evaluation: {e}")

    def detect_position_bias(self, days: int = 30) -> PositionBiasResult:
        """
        Detect position bias in pairwise comparisons.

        Checks if first-position responses win more often than expected (50%).

        Args:
            days: Number of days to analyze

        Returns:
            PositionBiasResult with bias analysis
        """
        conn = self._get_connection()
        cutoff = (datetime.now() - timedelta(days=days)).isoformat()

        # Get evaluations excluding ties
        rows = conn.execute(
            """
            SELECT first_position_won
            FROM pairwise_evaluations
            WHERE created_at >= ? AND winner != 'TIE'
        """,
            (cutoff,),
        ).fetchall()

        n = len(rows)
        if n < 10:
            # Too few decisive comparisons for the z-test to be meaningful.
            return PositionBiasResult(
                bias_detected=False,
                first_position_win_rate=0.0,
                z_score=0.0,
                sample_size=n,
                interpretation="Insufficient data (need at least 10 non-tie comparisons)",
            )

        first_wins = sum(r["first_position_won"] for r in rows)
        # Normal approximation to Binomial(n, 0.5): mean n/2, sd sqrt(n)/2.
        expected = n * 0.5
        std_dev = (n * 0.5 * 0.5) ** 0.5

        win_rate = first_wins / n
        z_score = (first_wins - expected) / std_dev if std_dev > 0 else 0.0

        # Bias detected if |z| > 2 (approximately 95% two-sided confidence)
        bias_detected = abs(z_score) > 2

        if bias_detected:
            if z_score > 0:
                interpretation = (
                    f"First-position bias detected: {win_rate:.1%} win rate (expected 50%)"
                )
            else:
                interpretation = f"Second-position bias detected: {1 - win_rate:.1%} win rate for second position"
        else:
            interpretation = f"No significant position bias: {win_rate:.1%} first-position win rate"

        return PositionBiasResult(
            bias_detected=bias_detected,
            first_position_win_rate=win_rate,
            z_score=z_score,
            sample_size=n,
            interpretation=interpretation,
        )

    def detect_length_bias(self, days: int = 30) -> LengthBiasResult:
        """
        Detect length bias in direct evaluations.

        Checks if longer responses receive higher scores.

        Args:
            days: Number of days to analyze

        Returns:
            LengthBiasResult with bias analysis
        """
        conn = self._get_connection()
        cutoff = (datetime.now() - timedelta(days=days)).isoformat()

        rows = conn.execute(
            """
            SELECT score, response_length
            FROM direct_evaluations
            WHERE created_at >= ? AND response_length > 0
        """,
            (cutoff,),
        ).fetchall()

        n = len(rows)
        if n < 10:
            return LengthBiasResult(
                bias_detected=False,
                correlation=0.0,
                p_value=1.0,
                sample_size=n,
                interpretation="Insufficient data (need at least 10 evaluations)",
            )

        scores = [r["score"] for r in rows]
        lengths = [r["response_length"] for r in rows]

        # Calculate Spearman correlation between length and score
        rho, p_value = spearmans_rho(lengths, scores)

        # One-sided by design: only a *positive* correlation (longer => higher
        # score) counts as length bias.
        bias_detected = rho > 0.3 and p_value < 0.05

        if bias_detected:
            interpretation = f"Length bias detected: correlation = {rho:.2f} (p = {p_value:.3f})"
        elif rho > 0.2:
            interpretation = f"Weak length bias possible: correlation = {rho:.2f}"
        else:
            interpretation = f"No significant length bias: correlation = {rho:.2f}"

        return LengthBiasResult(
            bias_detected=bias_detected,
            correlation=rho,
            p_value=p_value,
            sample_size=n,
            interpretation=interpretation,
        )

    def get_bias_report(self, days: int = 30) -> BiasReport:
        """
        Generate comprehensive bias report.

        Args:
            days: Number of days to analyze

        Returns:
            BiasReport with all bias analyses and recommendations
        """
        position_bias = self.detect_position_bias(days)
        length_bias = self.detect_length_bias(days)

        conn = self._get_connection()
        cutoff = (datetime.now() - timedelta(days=days)).isoformat()

        # Get total evaluations
        pairwise_count = conn.execute(
            "SELECT COUNT(*) FROM pairwise_evaluations WHERE created_at >= ?", (cutoff,)
        ).fetchone()[0]
        direct_count = conn.execute(
            "SELECT COUNT(*) FROM direct_evaluations WHERE created_at >= ?", (cutoff,)
        ).fetchone()[0]

        total = pairwise_count + direct_count
        warnings = []
        recommendations = []

        # Analyze position bias
        if position_bias.bias_detected:
            warnings.append(f"⚠️ Position Bias: {position_bias.interpretation}")
            recommendations.append("Increase position swap passes or use multi-shuffle comparison")

        # Analyze length bias
        if length_bias.bias_detected:
            warnings.append(f"⚠️ Length Bias: {length_bias.interpretation}")
            recommendations.append("Add explicit length normalization or penalize verbosity")
            recommendations.append("Include 'conciseness' as a separate evaluation criterion")

        # Sample size warnings
        if total < 50:
            warnings.append(f"📊 Small sample size ({total}): metrics may be unreliable")
            recommendations.append("Collect more evaluation data for reliable bias detection")

        # Position consistency check: how often the two swapped-position
        # passes agreed with each other.
        consistency = conn.execute(
            """
            SELECT AVG(position_consistent) as avg_consistency
            FROM pairwise_evaluations
            WHERE created_at >= ?
        """,
            (cutoff,),
        ).fetchone()

        if consistency and consistency["avg_consistency"] is not None:
            avg_cons = consistency["avg_consistency"]
            if avg_cons < 0.8:
                warnings.append(f"⚠️ Low position consistency: {avg_cons:.1%}")
                recommendations.append("Review evaluation criteria for ambiguity")

        if not warnings:
            recommendations.append("✅ No significant biases detected!")

        return BiasReport(
            position_bias=position_bias,
            length_bias=length_bias,
            total_evaluations=total,
            evaluation_period_days=days,
            warnings=warnings,
            recommendations=recommendations,
            generated_at=datetime.now().isoformat(),
        )

    def clear_old_data(self, days: int = 90):
        """
        Clear evaluation data older than specified days.

        Args:
            days: Keep data from the last N days
        """
        conn = self._get_connection()
        cutoff = (datetime.now() - timedelta(days=days)).isoformat()

        conn.execute("DELETE FROM pairwise_evaluations WHERE created_at < ?", (cutoff,))
        conn.execute("DELETE FROM direct_evaluations WHERE created_at < ?", (cutoff,))
        conn.commit()
        logger.info(f"Cleared bias monitor data older than {days} days")

__init__(project_root)

Initialize bias monitor.

Parameters:

Name Type Description Default
project_root Path

Project root for database storage

required
Source code in src/boring/judge/bias_monitor.py
def __init__(self, project_root: Path):
    """
    Initialize bias monitor.

    Args:
        project_root: Project root for database storage
    """
    root = Path(project_root)
    self.project_root = root
    # The SQLite file lives under the project's .boring_memory directory.
    self.db_path = root / ".boring_memory" / "bias_monitor.db"
    self.db_path.parent.mkdir(parents=True, exist_ok=True)
    self._init_db()

record_pairwise_evaluation(evaluation_id, winner, first_position, position_consistent, confidence=0.0, response_a_length=0, response_b_length=0)

Record a pairwise comparison result.

Parameters:

Name Type Description Default
evaluation_id str

Unique identifier for this evaluation

required
winner str

Winner of comparison ('A', 'B', 'TIE')

required
first_position str

Which response was in first position ('A' or 'B')

required
position_consistent bool

Whether both position passes agreed

required
confidence float

Confidence score of the decision

0.0
response_a_length int

Length of response A

0
response_b_length int

Length of response B

0
Source code in src/boring/judge/bias_monitor.py
def record_pairwise_evaluation(
    self,
    evaluation_id: str,
    winner: str,
    first_position: str,
    position_consistent: bool,
    confidence: float = 0.0,
    response_a_length: int = 0,
    response_b_length: int = 0,
):
    """
    Record a pairwise comparison result.

    Args:
        evaluation_id: Unique identifier for this evaluation
        winner: Winner of comparison ('A', 'B', 'TIE')
        first_position: Which response was in first position ('A' or 'B')
        position_consistent: Whether both position passes agreed
        confidence: Confidence score of the decision
        response_a_length: Length of response A
        response_b_length: Length of response B
    """
    winner = winner.upper()
    first_position = first_position.upper()

    # The first slot only "wins" on a decisive, non-tie verdict.
    won_first = int(winner != "TIE" and winner == first_position)

    row = (
        evaluation_id,
        winner,
        first_position,
        won_first,
        int(position_consistent),
        confidence,
        response_a_length,
        response_b_length,
        datetime.now().isoformat(),
    )

    conn = self._get_connection()
    try:
        conn.execute(
            """
            INSERT OR REPLACE INTO pairwise_evaluations
            (evaluation_id, winner, first_position, first_position_won,
             position_consistent, confidence, response_a_length,
             response_b_length, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
            row,
        )
        conn.commit()
        logger.debug(f"Recorded pairwise evaluation: {evaluation_id}")
    except Exception as e:
        # Best-effort telemetry: log and carry on.
        logger.error(f"Failed to record pairwise evaluation: {e}")

record_direct_evaluation(evaluation_id, score, response_length, dimension_scores=None)

Record a direct scoring evaluation.

Parameters:

Name Type Description Default
evaluation_id str

Unique identifier for this evaluation

required
score float

Overall score

required
response_length int

Length of the response (characters or tokens)

required
dimension_scores dict | None

Optional per-dimension scores

None
Source code in src/boring/judge/bias_monitor.py
def record_direct_evaluation(
    self,
    evaluation_id: str,
    score: float,
    response_length: int,
    dimension_scores: dict | None = None,
):
    """
    Record a direct scoring evaluation.

    Args:
        evaluation_id: Unique identifier for this evaluation
        score: Overall score
        response_length: Length of the response (characters or tokens)
        dimension_scores: Optional per-dimension scores
    """
    # Per-dimension scores are persisted as a JSON blob ({} when absent).
    payload = json.dumps(dimension_scores or {})

    conn = self._get_connection()
    try:
        conn.execute(
            """
            INSERT OR REPLACE INTO direct_evaluations
            (evaluation_id, score, response_length, dimension_scores, created_at)
            VALUES (?, ?, ?, ?, ?)
        """,
            (evaluation_id, score, response_length, payload, datetime.now().isoformat()),
        )
        conn.commit()
        logger.debug(f"Recorded direct evaluation: {evaluation_id}")
    except Exception as e:
        # Best-effort telemetry: log and carry on.
        logger.error(f"Failed to record direct evaluation: {e}")

detect_position_bias(days=30)

Detect position bias in pairwise comparisons.

Checks if first-position responses win more often than expected (50%).

Parameters:

Name Type Description Default
days int

Number of days to analyze

30

Returns:

Type Description
PositionBiasResult

PositionBiasResult with bias analysis

Source code in src/boring/judge/bias_monitor.py
def detect_position_bias(self, days: int = 30) -> PositionBiasResult:
    """
    Detect position bias in pairwise comparisons.

    Under an unbiased judge, the first-position response should win about
    50% of decisive (non-tie) comparisons; this runs a z-test against that
    null hypothesis using the normal approximation to the binomial.

    Args:
        days: Number of days to analyze

    Returns:
        PositionBiasResult with bias analysis
    """
    cutoff = (datetime.now() - timedelta(days=days)).isoformat()

    # Ties carry no positional signal, so they are excluded up front.
    rows = self._get_connection().execute(
        """
        SELECT first_position_won
        FROM pairwise_evaluations
        WHERE created_at >= ? AND winner != 'TIE'
    """,
        (cutoff,),
    ).fetchall()

    total = len(rows)
    if total < 10:
        # Too few decisive comparisons for the z-test to mean anything.
        return PositionBiasResult(
            bias_detected=False,
            first_position_win_rate=0.0,
            z_score=0.0,
            sample_size=total,
            interpretation="Insufficient data (need at least 10 non-tie comparisons)",
        )

    wins = sum(row["first_position_won"] for row in rows)
    rate = wins / total
    # Binomial(n, 0.5) normal approximation: mean n/2, sd sqrt(n)/2.
    sd = (total * 0.5 * 0.5) ** 0.5
    z = (wins - total * 0.5) / sd if sd > 0 else 0

    # |z| > 2 corresponds to roughly 95% two-sided confidence.
    biased = abs(z) > 2

    if not biased:
        interpretation = f"No significant position bias: {rate:.1%} first-position win rate"
    elif z > 0:
        interpretation = f"First-position bias detected: {rate:.1%} win rate (expected 50%)"
    else:
        interpretation = f"Second-position bias detected: {1 - rate:.1%} win rate for second position"

    return PositionBiasResult(
        bias_detected=biased,
        first_position_win_rate=rate,
        z_score=z,
        sample_size=total,
        interpretation=interpretation,
    )

detect_length_bias(days=30)

Detect length bias in direct evaluations.

Checks if longer responses receive higher scores.

Parameters:

Name Type Description Default
days int

Number of days to analyze

30

Returns:

Type Description
LengthBiasResult

LengthBiasResult with bias analysis

Source code in src/boring/judge/bias_monitor.py
def detect_length_bias(self, days: int = 30) -> LengthBiasResult:
    """
    Detect length bias in direct evaluations.

    Computes the Spearman rank correlation between response length and
    score; a clearly positive correlation suggests longer answers are
    being rewarded independently of quality.

    Args:
        days: Number of days to analyze

    Returns:
        LengthBiasResult with bias analysis
    """
    cutoff = (datetime.now() - timedelta(days=days)).isoformat()

    rows = self._get_connection().execute(
        """
        SELECT score, response_length
        FROM direct_evaluations
        WHERE created_at >= ? AND response_length > 0
    """,
        (cutoff,),
    ).fetchall()

    sample_size = len(rows)
    if sample_size < 10:
        return LengthBiasResult(
            bias_detected=False,
            correlation=0.0,
            p_value=1.0,
            sample_size=sample_size,
            interpretation="Insufficient data (need at least 10 evaluations)",
        )

    scores = []
    lengths = []
    for row in rows:
        scores.append(row["score"])
        lengths.append(row["response_length"])

    rho, p_value = spearmans_rho(lengths, scores)

    # One-sided by design: only a positive, significant correlation
    # (longer => higher score) counts as length bias.
    biased = rho > 0.3 and p_value < 0.05

    if biased:
        interpretation = f"Length bias detected: correlation = {rho:.2f} (p = {p_value:.3f})"
    elif rho > 0.2:
        interpretation = f"Weak length bias possible: correlation = {rho:.2f}"
    else:
        interpretation = f"No significant length bias: correlation = {rho:.2f}"

    return LengthBiasResult(
        bias_detected=biased,
        correlation=rho,
        p_value=p_value,
        sample_size=sample_size,
        interpretation=interpretation,
    )

get_bias_report(days=30)

Generate comprehensive bias report.

Parameters:

Name Type Description Default
days int

Number of days to analyze

30

Returns:

Type Description
BiasReport

BiasReport with all bias analyses and recommendations

Source code in src/boring/judge/bias_monitor.py
def get_bias_report(self, days: int = 30) -> BiasReport:
    """
    Generate comprehensive bias report.

    Runs both bias detectors over the window, counts the evaluations
    involved, and turns the findings into warnings plus actionable
    recommendations.

    Args:
        days: Number of days to analyze

    Returns:
        BiasReport with all bias analyses and recommendations
    """
    position = self.detect_position_bias(days)
    length = self.detect_length_bias(days)

    conn = self._get_connection()
    cutoff = (datetime.now() - timedelta(days=days)).isoformat()

    # Count evaluations of both kinds inside the window.
    n_pairwise = conn.execute(
        "SELECT COUNT(*) FROM pairwise_evaluations WHERE created_at >= ?", (cutoff,)
    ).fetchone()[0]
    n_direct = conn.execute(
        "SELECT COUNT(*) FROM direct_evaluations WHERE created_at >= ?", (cutoff,)
    ).fetchone()[0]
    total = n_pairwise + n_direct

    warning_list: list = []
    advice: list = []

    if position.bias_detected:
        warning_list.append(f"⚠️ Position Bias: {position.interpretation}")
        advice.append("Increase position swap passes or use multi-shuffle comparison")

    if length.bias_detected:
        warning_list.append(f"⚠️ Length Bias: {length.interpretation}")
        advice.append("Add explicit length normalization or penalize verbosity")
        advice.append("Include 'conciseness' as a separate evaluation criterion")

    if total < 50:
        warning_list.append(f"📊 Small sample size ({total}): metrics may be unreliable")
        advice.append("Collect more evaluation data for reliable bias detection")

    # Agreement rate between the swapped-position passes.
    consistency_row = conn.execute(
        """
        SELECT AVG(position_consistent) as avg_consistency
        FROM pairwise_evaluations
        WHERE created_at >= ?
    """,
        (cutoff,),
    ).fetchone()

    if consistency_row and consistency_row["avg_consistency"] is not None:
        avg_consistency = consistency_row["avg_consistency"]
        if avg_consistency < 0.8:
            warning_list.append(f"⚠️ Low position consistency: {avg_consistency:.1%}")
            advice.append("Review evaluation criteria for ambiguity")

    if not warning_list:
        advice.append("✅ No significant biases detected!")

    return BiasReport(
        position_bias=position,
        length_bias=length,
        total_evaluations=total,
        evaluation_period_days=days,
        warnings=warning_list,
        recommendations=advice,
        generated_at=datetime.now().isoformat(),
    )

clear_old_data(days=90)

Clear evaluation data older than specified days.

Parameters:

Name Type Description Default
days int

Keep data from the last N days

90
Source code in src/boring/judge/bias_monitor.py
def clear_old_data(self, days: int = 90):
    """
    Clear evaluation data older than specified days.

    Args:
        days: Keep data from the last N days
    """
    cutoff = (datetime.now() - timedelta(days=days)).isoformat()
    conn = self._get_connection()
    # Same retention policy applies to both tables.
    for statement in (
        "DELETE FROM pairwise_evaluations WHERE created_at < ?",
        "DELETE FROM direct_evaluations WHERE created_at < ?",
    ):
        conn.execute(statement, (cutoff,))
    conn.commit()
    logger.info(f"Cleared bias monitor data older than {days} days")

BiasReport dataclass

Comprehensive bias report.

Source code in src/boring/judge/bias_monitor.py
@dataclass
class BiasReport:
    """Comprehensive bias report.

    Bundles the position- and length-bias analyses for one evaluation
    window together with human-readable warnings and recommendations.
    """

    # Position-bias analysis result; None when not computed.
    position_bias: PositionBiasResult | None = None
    # Length-bias analysis result; None when not computed.
    length_bias: LengthBiasResult | None = None
    # Total evaluations (pairwise + direct) inside the analysis window.
    total_evaluations: int = 0
    # Size of the analysis window, in days.
    evaluation_period_days: int = 30
    # Human-readable warning strings for detected problems.
    warnings: list = field(default_factory=list)
    # Actionable mitigation suggestions.
    recommendations: list = field(default_factory=list)
    # ISO-8601 timestamp of when the report was generated.
    generated_at: str = ""

LengthBiasResult dataclass

Result of length bias analysis.

Source code in src/boring/judge/bias_monitor.py
@dataclass
class LengthBiasResult:
    """Result of length bias analysis."""

    # True when the length/score correlation exceeds the detection threshold.
    bias_detected: bool
    # Spearman correlation between response length and score.
    correlation: float
    # p-value associated with the correlation.
    p_value: float
    # Number of evaluations analyzed.
    sample_size: int
    # Human-readable summary of the finding.
    interpretation: str = ""

PositionBiasResult dataclass

Result of position bias analysis.

Source code in src/boring/judge/bias_monitor.py
@dataclass
class PositionBiasResult:
    """Result of position bias analysis."""

    # True when the first-position win rate deviates significantly from 50%.
    bias_detected: bool
    # Fraction of non-tie comparisons won by the first-position response.
    first_position_win_rate: float
    # z-score of the observed win count versus the unbiased expectation.
    z_score: float
    # Number of non-tie comparisons analyzed.
    sample_size: int
    # Human-readable summary of the finding.
    interpretation: str = ""

LLMJudge

LLM-as-a-Judge implementation for evaluating code and plans.

V10.25 Enhancements:

- Confidence calibration based on position consistency
- Length-normalized scoring to mitigate length bias
- BiasMonitor integration for systematic bias tracking

Source code in src/boring/judge/core.py
class LLMJudge:
    """
    LLM-as-a-Judge implementation for evaluating code and plans.

    V10.25 Enhancements:
    - Confidence calibration based on position consistency
    - Length-normalized scoring to mitigate length bias
    - BiasMonitor integration for systematic bias tracking
    """

    def __init__(
        self,
        provider: LLMProvider,
        quality_tracker: QualityTracker | None = None,
        project_root: Path | None = None,
        enable_bias_tracking: bool = True,
    ):
        """
        Args:
            provider: LLM backend used to execute judge prompts.
            quality_tracker: Optional tracker; when set, grade_code records scores.
            project_root: Project root; needed for bias-monitor storage.
            enable_bias_tracking: Attach a BiasMonitor when True and
                project_root is provided.
        """
        self.cli = (
            provider  # Renaming this would ripple too much, keeping name but typing is generalized
        )
        self.tracker = quality_tracker  # Optional: for automatic history recording
        self.project_root = project_root
        self._bias_monitor = None

        # Initialize bias monitor if enabled and project root provided
        if enable_bias_tracking and project_root:
            try:
                from .bias_monitor import get_bias_monitor

                self._bias_monitor = get_bias_monitor(project_root)
            except ImportError:
                logger.debug("BiasMonitor not available, skipping bias tracking")

    def grade_code(
        self,
        filename: str,
        content: str,
        rubric: Rubric = CODE_QUALITY_RUBRIC,
        interactive: bool = False,
    ) -> dict[str, Any]:
        """
        Evaluate code quality against a rubric.
        If interactive=True, returns the PROMPT for the user to execute using their IDE AI.
        Else, executes via CLI adapter.
        """
        prompt = build_grade_prompt(filename, content, rubric, str(type(self.cli)))

        if interactive:
            # Return the prompts for the host AI (Cursor) to run
            return {
                "score": 0,
                "status": "pending_manual_review",
                "reasoning": "Delegated to Host AI",
                "prompt": prompt,
            }

        try:
            # Call LLM provider
            response = self.cli.chat(prompt, interactive=False)

            # Extract and parse JSON
            result = extract_json(response)
            if result:
                # Record score to quality tracker if available
                if self.tracker and "score" in result:
                    self.tracker.record(result.get("score", 0), 0, context="judge")
                return result
            else:
                logger.warning("No JSON found in judge response")
                return {"score": 0, "reasoning": "Failed to parse judge response", "raw": response}

        except Exception as e:
            # Fix: log with traceback via the logger instead of leaking leftover
            # debug output to stdout (print / traceback.print_exc).
            logger.exception(f"Judge failed: {e}")
            return {"score": 0, "reasoning": str(e)}

    def compare_plans(
        self, plan_a: str, plan_b: str, context: str, interactive: bool = False
    ) -> dict[str, Any]:
        """
        Compare two implementation plans and pick a winner.

        Implements Pairwise Comparison with Position Bias Mitigation.
        """
        if interactive:
            # Return both prompts for manual execution
            return {
                "status": "pending_manual_review",
                "prompts": {
                    "pass1": build_comparison_prompt(plan_a, plan_b, "A", "B", context),
                    "pass2": build_comparison_prompt(plan_b, plan_a, "B", "A", context),
                },
                "instructions": "Execute both prompts and compare results. If winners match, that's the final winner. If they differ, the result is TIE.",
            }

        try:
            # First pass: A in position 1, B in position 2
            prompt_pass1 = build_comparison_prompt(plan_a, plan_b, "A", "B", context)
            response_pass1 = self.cli.chat(prompt_pass1, interactive=False)
            result_pass1 = extract_json(response_pass1)

            if not result_pass1:
                return {
                    "winner": "TIE",
                    "confidence": 0.0,
                    "error": "Failed to parse first pass response",
                }

            # Second pass: B in position 1, A in position 2
            prompt_pass2 = build_comparison_prompt(plan_b, plan_a, "B", "A", context)
            response_pass2 = self.cli.chat(prompt_pass2, interactive=False)
            result_pass2 = extract_json(response_pass2)

            if not result_pass2:
                return {
                    "winner": "TIE",
                    "confidence": 0.0,
                    "error": "Failed to parse second pass response",
                }

            # Extract winners (normalize to A/B/TIE).
            # Fix: `or "TIE"` also covers an explicit JSON null, which the
            # .get(..., "TIE") default does not (None has no .upper()).
            winner_pass1 = (result_pass1.get("winner") or "TIE").upper()
            winner_pass2 = (result_pass2.get("winner") or "TIE").upper()

            conf_pass1 = float(result_pass1.get("confidence", 0.5))
            conf_pass2 = float(result_pass2.get("confidence", 0.5))

            # Position Bias Mitigation: Check consistency
            consistent = winner_pass1 == winner_pass2

            if consistent:
                final_winner = winner_pass1
                final_confidence = (conf_pass1 + conf_pass2) / 2
            else:
                final_winner = "TIE"
                final_confidence = 0.5

            return {
                "winner": final_winner,
                "confidence": round(final_confidence, 2),
                "positionConsistency": {
                    "consistent": consistent,
                    "pass1": {"winner": winner_pass1, "confidence": conf_pass1},
                    "pass2": {"winner": winner_pass2, "confidence": conf_pass2},
                },
                "reasoning": result_pass1.get("overall_reasoning", "")
                if consistent
                else "Position bias detected - inconsistent results across passes",
            }

        except Exception as e:
            logger.error(f"Plan comparison failed: {e}")
            return {"winner": "TIE", "confidence": 0.0, "error": str(e)}

    def compare_code(
        self,
        name_a: str,
        code_a: str,
        name_b: str,
        code_b: str,
        context: str | None = None,
        interactive: bool = False,
    ) -> dict[str, Any]:
        """
        Compare two code implementations (A/B Test).
        """
        if interactive:
            return {
                "status": "pending_manual_review",
                "prompts": {
                    "pass1": build_code_comparison_prompt(code_a, code_b, "A", "B", context),
                    "pass2": build_code_comparison_prompt(code_b, code_a, "B", "A", context),
                },
                "instructions": "Execute both prompts. If they agree on the winner (swapping A/B), that is the result.",
            }

        try:
            # First pass: A vs B
            prompt_pass1 = build_code_comparison_prompt(code_a, code_b, "A", "B", context)
            response_pass1 = self.cli.chat(prompt_pass1, interactive=False)
            result_pass1 = extract_json(response_pass1)

            if not result_pass1:
                return {"winner": "TIE", "confidence": 0.0, "error": "Failed to parse first pass"}

            # Second pass: B vs A (Position Bias Check)
            prompt_pass2 = build_code_comparison_prompt(code_b, code_a, "B", "A", context)
            response_pass2 = self.cli.chat(prompt_pass2, interactive=False)
            result_pass2 = extract_json(response_pass2)

            if not result_pass2:
                return {"winner": "TIE", "confidence": 0.0, "error": "Failed to parse second pass"}

            # Fix: None-safe normalization (explicit JSON null would crash .upper()).
            winner_pass1 = (result_pass1.get("winner") or "TIE").upper()
            winner_pass2 = (result_pass2.get("winner") or "TIE").upper()

            consistent = winner_pass1 == winner_pass2

            if consistent:
                final_winner = winner_pass1
                final_conf = (
                    float(result_pass1.get("confidence", 0.5))
                    + float(result_pass2.get("confidence", 0.5))
                ) / 2
            else:
                final_winner = "TIE"
                final_conf = 0.5

            return {
                "winner": final_winner,
                "confidence": round(final_conf, 2),
                "positionConsistency": consistent,
                "reasoning": result_pass1.get("overall_reasoning", ""),
            }

        except Exception as e:
            logger.error(f"Code comparison failed: {e}")
            return {"winner": "TIE", "confidence": 0.0, "error": str(e)}

    def _extract_json(self, response: str) -> dict[str, Any] | None:
        """Deprecated: Internal wrapper for backward compatibility within class."""
        return extract_json(response)

    def _build_grade_prompt(self, filename: str, content: str, rubric: Rubric) -> str:
        """Compatibility wrapper for build_grade_prompt."""
        return build_grade_prompt(filename, content, rubric, str(type(self.cli)))

    # =========================================================================
    # V10.25: Advanced Evaluation Methods
    # =========================================================================

    def calibrate_confidence(
        self,
        raw_confidence: float,
        position_consistent: bool,
        evidence_count: int = 0,
    ) -> float:
        """
        Calibrate confidence based on multiple signals.

        Args:
            raw_confidence: Raw confidence from model output (0-1)
            position_consistent: Whether position swap passes agreed
            evidence_count: Number of evidence items supporting the judgment

        Returns:
            Calibrated confidence score (0-1)
        """
        calibrated = raw_confidence

        # Position consistency is a strong signal
        if not position_consistent:
            calibrated *= 0.6  # Significant reduction for inconsistency

        # More evidence = higher confidence
        evidence_factor = min(evidence_count / 3, 1.0)  # Cap at 3 pieces
        calibrated *= 0.7 + 0.3 * evidence_factor

        return min(calibrated, 0.99)  # Never 100% confident

    def length_normalized_score(
        self,
        score: float,
        response_length: int,
        target_length: int = 500,
        max_penalty: float = 0.5,
    ) -> float:
        """
        Adjust score based on response length to mitigate length bias.

        Args:
            score: Original score
            response_length: Length of the response (characters)
            target_length: Expected typical length
            max_penalty: Maximum penalty to apply

        Returns:
            Length-adjusted score
        """
        length_ratio = response_length / target_length if target_length > 0 else 1.0

        if length_ratio > 2.0:
            # Penalize excessively long responses
            penalty = min((length_ratio - 2.0) * 0.1, max_penalty)
            return max(score - penalty, 1.0)
        elif length_ratio < 0.3:
            # Penalize excessively short responses
            penalty = min((0.3 - length_ratio) * 0.5, max_penalty)
            return max(score - penalty, 1.0)
        else:
            return score

    def get_bias_report(self, days: int = 30) -> dict | None:
        """
        Get bias monitoring report.

        Args:
            days: Number of days to analyze

        Returns:
            Bias report dict or None if monitoring not available
        """
        if self._bias_monitor is None:
            return None

        try:
            from .bias_monitor import format_bias_report

            report = self._bias_monitor.get_bias_report(days)
            return {
                "formatted": format_bias_report(report),
                "position_bias": {
                    "detected": report.position_bias.bias_detected
                    if report.position_bias
                    else False,
                    "first_position_win_rate": report.position_bias.first_position_win_rate
                    if report.position_bias
                    else 0,
                },
                "length_bias": {
                    "detected": report.length_bias.bias_detected if report.length_bias else False,
                    "correlation": report.length_bias.correlation if report.length_bias else 0,
                },
                "warnings": report.warnings,
                "recommendations": report.recommendations,
            }
        except Exception as e:
            logger.error(f"Failed to get bias report: {e}")
            return None

    def _record_evaluation(
        self,
        evaluation_type: str,
        result: dict,
        response_length: int = 0,
    ):
        """Record evaluation to bias monitor (best-effort; never raises)."""
        if self._bias_monitor is None:
            return

        try:
            eval_id = str(uuid.uuid4())[:8]

            if evaluation_type == "pairwise":
                # Fix: compare_plans stores a dict under "positionConsistency"
                # ({"consistent": bool, ...}) while compare_code stores a bool;
                # normalize both to a bool before recording.
                consistency = result.get("positionConsistency", False)
                if isinstance(consistency, dict):
                    consistency = bool(consistency.get("consistent", False))
                self._bias_monitor.record_pairwise_evaluation(
                    evaluation_id=eval_id,
                    winner=result.get("winner", "TIE"),
                    first_position="A",  # A is always first in our implementation
                    position_consistent=consistency,
                    confidence=result.get("confidence", 0.0),
                )
            elif evaluation_type == "direct":
                self._bias_monitor.record_direct_evaluation(
                    evaluation_id=eval_id,
                    score=result.get("score", 0),
                    response_length=response_length,
                    dimension_scores=result.get("dimensions"),
                )
        except Exception as e:
            logger.debug(f"Failed to record evaluation: {e}")

grade_code(filename, content, rubric=CODE_QUALITY_RUBRIC, interactive=False)

Evaluate code quality against a rubric. If interactive=True, returns the PROMPT for the user to execute using their IDE AI. Else, executes via CLI adapter.

Source code in src/boring/judge/core.py
def grade_code(
    self,
    filename: str,
    content: str,
    rubric: Rubric = CODE_QUALITY_RUBRIC,
    interactive: bool = False,
) -> dict[str, Any]:
    """
    Evaluate code quality against a rubric.
    If interactive=True, returns the PROMPT for the user to execute using their IDE AI.
    Else, executes via CLI adapter.
    """
    prompt = build_grade_prompt(filename, content, rubric, str(type(self.cli)))

    if interactive:
        # Return the prompts for the host AI (Cursor) to run
        return {
            "score": 0,
            "status": "pending_manual_review",
            "reasoning": "Delegated to Host AI",
            "prompt": prompt,
        }

    try:
        # Call LLM provider
        response = self.cli.chat(prompt, interactive=False)

        # Extract and parse JSON
        result = extract_json(response)
        if result:
            # Record score to quality tracker if available
            if self.tracker and "score" in result:
                self.tracker.record(result.get("score", 0), 0, context="judge")
            return result
        else:
            logger.warning("No JSON found in judge response")
            return {"score": 0, "reasoning": "Failed to parse judge response", "raw": response}

    except Exception as e:
        # Fix: log with traceback via the logger instead of leaking leftover
        # debug output to stdout (print / traceback.print_exc).
        logger.exception(f"Judge failed: {e}")
        return {"score": 0, "reasoning": str(e)}

compare_plans(plan_a, plan_b, context, interactive=False)

Compare two implementation plans and pick a winner.

Implements Pairwise Comparison with Position Bias Mitigation.

Source code in src/boring/judge/core.py
def compare_plans(
    self, plan_a: str, plan_b: str, context: str, interactive: bool = False
) -> dict[str, Any]:
    """
    Compare two implementation plans and pick a winner.

    Implements Pairwise Comparison with Position Bias Mitigation.
    """
    if interactive:
        # Return both prompts for manual execution
        return {
            "status": "pending_manual_review",
            "prompts": {
                "pass1": build_comparison_prompt(plan_a, plan_b, "A", "B", context),
                "pass2": build_comparison_prompt(plan_b, plan_a, "B", "A", context),
            },
            "instructions": "Execute both prompts and compare results. If winners match, that's the final winner. If they differ, the result is TIE.",
        }

    try:
        # First pass: A in position 1, B in position 2
        prompt_pass1 = build_comparison_prompt(plan_a, plan_b, "A", "B", context)
        response_pass1 = self.cli.chat(prompt_pass1, interactive=False)
        result_pass1 = extract_json(response_pass1)

        if not result_pass1:
            return {
                "winner": "TIE",
                "confidence": 0.0,
                "error": "Failed to parse first pass response",
            }

        # Second pass: B in position 1, A in position 2
        prompt_pass2 = build_comparison_prompt(plan_b, plan_a, "B", "A", context)
        response_pass2 = self.cli.chat(prompt_pass2, interactive=False)
        result_pass2 = extract_json(response_pass2)

        if not result_pass2:
            return {
                "winner": "TIE",
                "confidence": 0.0,
                "error": "Failed to parse second pass response",
            }

        # Extract winners (normalize to A/B/TIE).
        # Fix: `or "TIE"` also covers an explicit JSON null, which the
        # .get(..., "TIE") default does not (None has no .upper()).
        winner_pass1 = (result_pass1.get("winner") or "TIE").upper()
        winner_pass2 = (result_pass2.get("winner") or "TIE").upper()

        conf_pass1 = float(result_pass1.get("confidence", 0.5))
        conf_pass2 = float(result_pass2.get("confidence", 0.5))

        # Position Bias Mitigation: Check consistency
        consistent = winner_pass1 == winner_pass2

        if consistent:
            final_winner = winner_pass1
            final_confidence = (conf_pass1 + conf_pass2) / 2
        else:
            final_winner = "TIE"
            final_confidence = 0.5

        return {
            "winner": final_winner,
            "confidence": round(final_confidence, 2),
            "positionConsistency": {
                "consistent": consistent,
                "pass1": {"winner": winner_pass1, "confidence": conf_pass1},
                "pass2": {"winner": winner_pass2, "confidence": conf_pass2},
            },
            "reasoning": result_pass1.get("overall_reasoning", "")
            if consistent
            else "Position bias detected - inconsistent results across passes",
        }

    except Exception as e:
        logger.error(f"Plan comparison failed: {e}")
        return {"winner": "TIE", "confidence": 0.0, "error": str(e)}

compare_code(name_a, code_a, name_b, code_b, context=None, interactive=False)

Compare two code implementations (A/B Test).

Source code in src/boring/judge/core.py
def compare_code(
    self,
    name_a: str,
    code_a: str,
    name_b: str,
    code_b: str,
    context: str | None = None,
    interactive: bool = False,
) -> dict[str, Any]:
    """
    Compare two code implementations (A/B Test).
    """
    if interactive:
        return {
            "status": "pending_manual_review",
            "prompts": {
                "pass1": build_code_comparison_prompt(code_a, code_b, "A", "B", context),
                "pass2": build_code_comparison_prompt(code_b, code_a, "B", "A", context),
            },
            "instructions": "Execute both prompts. If they agree on the winner (swapping A/B), that is the result.",
        }

    try:
        # First pass: A vs B
        prompt_pass1 = build_code_comparison_prompt(code_a, code_b, "A", "B", context)
        response_pass1 = self.cli.chat(prompt_pass1, interactive=False)
        result_pass1 = extract_json(response_pass1)

        if not result_pass1:
            return {"winner": "TIE", "confidence": 0.0, "error": "Failed to parse first pass"}

        # Second pass: B vs A (Position Bias Check)
        prompt_pass2 = build_code_comparison_prompt(code_b, code_a, "B", "A", context)
        response_pass2 = self.cli.chat(prompt_pass2, interactive=False)
        result_pass2 = extract_json(response_pass2)

        if not result_pass2:
            return {"winner": "TIE", "confidence": 0.0, "error": "Failed to parse second pass"}

        # Fix: None-safe normalization (explicit JSON null would crash .upper()).
        winner_pass1 = (result_pass1.get("winner") or "TIE").upper()
        winner_pass2 = (result_pass2.get("winner") or "TIE").upper()

        consistent = winner_pass1 == winner_pass2

        if consistent:
            final_winner = winner_pass1
            final_conf = (
                float(result_pass1.get("confidence", 0.5))
                + float(result_pass2.get("confidence", 0.5))
            ) / 2
        else:
            final_winner = "TIE"
            final_conf = 0.5

        return {
            "winner": final_winner,
            "confidence": round(final_conf, 2),
            "positionConsistency": consistent,
            "reasoning": result_pass1.get("overall_reasoning", ""),
        }

    except Exception as e:
        logger.error(f"Code comparison failed: {e}")
        return {"winner": "TIE", "confidence": 0.0, "error": str(e)}

calibrate_confidence(raw_confidence, position_consistent, evidence_count=0)

Calibrate confidence based on multiple signals.

Parameters:

Name Type Description Default
raw_confidence float

Raw confidence from model output (0-1)

required
position_consistent bool

Whether position swap passes agreed

required
evidence_count int

Number of evidence items supporting the judgment

0

Returns:

Type Description
float

Calibrated confidence score (0-1)

Source code in src/boring/judge/core.py
def calibrate_confidence(
    self,
    raw_confidence: float,
    position_consistent: bool,
    evidence_count: int = 0,
) -> float:
    """
    Combine several signals into a single calibrated confidence value.

    Args:
        raw_confidence: Model-reported confidence in [0, 1].
        position_consistent: Whether both position-swapped passes agreed.
        evidence_count: Number of supporting evidence items.

    Returns:
        Calibrated confidence in [0, 1], always strictly below 1.0.
    """
    # Disagreement between swapped passes is a strong negative signal.
    consistency_factor = 1.0 if position_consistent else 0.6
    # Evidence scales confidence between 0.7 (none) and 1.0 (3+ items).
    evidence_factor = 0.7 + 0.3 * min(evidence_count / 3, 1.0)
    # Cap below certainty: a judge should never be 100% confident.
    return min(raw_confidence * consistency_factor * evidence_factor, 0.99)

length_normalized_score(score, response_length, target_length=500, max_penalty=0.5)

Adjust score based on response length to mitigate length bias.

Parameters:

Name Type Description Default
score float

Original score

required
response_length int

Length of the response (characters)

required
target_length int

Expected typical length

500
max_penalty float

Maximum penalty to apply

0.5

Returns:

Type Description
float

Length-adjusted score

Source code in src/boring/judge/core.py
def length_normalized_score(
    self,
    score: float,
    response_length: int,
    target_length: int = 500,
    max_penalty: float = 0.5,
) -> float:
    """
    Adjust a score for response length to counter length bias.

    Args:
        score: Unadjusted score.
        response_length: Response size in characters.
        target_length: Typical expected response size.
        max_penalty: Upper bound on the applied penalty.

    Returns:
        The score, reduced when the response is far longer or shorter
        than the target (never below 1.0).
    """
    ratio = response_length / target_length if target_length > 0 else 1.0

    # Within [0.3x, 2x] of the target length: no adjustment.
    if 0.3 <= ratio <= 2.0:
        return score

    if ratio > 2.0:
        # Over-long: 0.1 penalty per unit of ratio beyond 2x, capped.
        penalty = min((ratio - 2.0) * 0.1, max_penalty)
    else:
        # Over-short: 0.5 penalty per unit of ratio below 0.3x, capped.
        penalty = min((0.3 - ratio) * 0.5, max_penalty)
    # 1.0 is the scoring floor.
    return max(score - penalty, 1.0)

get_bias_report(days=30)

Get bias monitoring report.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| days | int | Number of days to analyze | 30 |

Returns:

Type Description
dict | None

Bias report dict or None if monitoring not available

Source code in src/boring/judge/core.py
def get_bias_report(self, days: int = 30) -> dict | None:
    """
    Get bias monitoring report.

    Args:
        days: Number of days to analyze

    Returns:
        Bias report dict or None if monitoring not available
    """
    if self._bias_monitor is None:
        return None

    try:
        from .bias_monitor import format_bias_report

        report = self._bias_monitor.get_bias_report(days)
        return {
            "formatted": format_bias_report(report),
            "position_bias": {
                "detected": report.position_bias.bias_detected
                if report.position_bias
                else False,
                "first_position_win_rate": report.position_bias.first_position_win_rate
                if report.position_bias
                else 0,
            },
            "length_bias": {
                "detected": report.length_bias.bias_detected if report.length_bias else False,
                "correlation": report.length_bias.correlation if report.length_bias else 0,
            },
            "warnings": report.warnings,
            "recommendations": report.recommendations,
        }
    except Exception as e:
        logger.error(f"Failed to get bias report: {e}")
        return None

DetailedCriterion dataclass

A criterion with detailed level descriptions.

Source code in src/boring/judge/rubric_generator.py
@dataclass
class DetailedCriterion:
    """A criterion with detailed level descriptions."""

    name: str  # criterion identifier shown in the rubric
    description: str  # what this criterion measures
    weight: float = 1.0  # relative weight in overall scoring
    levels: list[RubricLevel] = field(default_factory=list)  # per-score level descriptions
    edge_cases: list[EdgeCase] = field(default_factory=list)  # criterion-specific edge-case guidance

DetailedRubric dataclass

A complete rubric with all level descriptions and edge cases.

Source code in src/boring/judge/rubric_generator.py
@dataclass
class DetailedRubric:
    """A complete rubric with all level descriptions and edge cases."""

    name: str  # rubric title
    description: str  # what the rubric evaluates
    domain: str  # evaluation domain the rubric applies to
    scale: str  # "1-3", "1-5", "1-10"
    strictness: str  # "lenient", "balanced", "strict"
    criteria: list[DetailedCriterion] = field(default_factory=list)  # weighted criteria
    general_edge_cases: list[EdgeCase] = field(default_factory=list)  # rubric-wide edge-case guidance
    scoring_guidelines: list[str] = field(default_factory=list)  # overall scoring notes

EdgeCase dataclass

Edge case guidance for consistent evaluation.

Source code in src/boring/judge/rubric_generator.py
@dataclass
class EdgeCase:
    """Edge case guidance for consistent evaluation."""

    situation: str  # description of the tricky situation an evaluator may face
    guidance: str  # how the evaluator should score/handle that situation

RubricLevel dataclass

A level in the rubric with detailed description.

Source code in src/boring/judge/rubric_generator.py
@dataclass
class RubricLevel:
    """A level in the rubric with detailed description."""

    score: int  # numeric score this level maps to
    label: str  # short name for the level
    description: str  # what work at this level looks like
    characteristics: list[str] = field(default_factory=list)  # concrete markers of this level

format_bias_report(report)

Format bias report as markdown.

Source code in src/boring/judge/bias_monitor.py
def format_bias_report(report: BiasReport) -> str:
    """Render a BiasReport as a markdown document."""
    out: list[str] = ["# 🔍 Bias Monitoring Report", ""]
    out.extend(
        [
            f"**Period**: Last {report.evaluation_period_days} days",
            f"**Total Evaluations**: {report.total_evaluations}",
            f"**Generated**: {report.generated_at[:19]}",
            "",
        ]
    )

    # Position bias section (omitted when no analysis is present).
    pb = report.position_bias
    if pb:
        status = "🔴" if pb.bias_detected else "🟢"
        out.extend(
            [
                f"## {status} Position Bias",
                "",
                "| Metric | Value |",
                "|--------|-------|",
                f"| First Position Win Rate | {pb.first_position_win_rate:.1%} |",
                f"| Z-Score | {pb.z_score:.2f} |",
                f"| Sample Size | {pb.sample_size} |",
                "",
                f"**Analysis**: {pb.interpretation}",
                "",
            ]
        )

    # Length bias section (omitted when no analysis is present).
    lb = report.length_bias
    if lb:
        status = "🔴" if lb.bias_detected else "🟢"
        out.extend(
            [
                f"## {status} Length Bias",
                "",
                "| Metric | Value |",
                "|--------|-------|",
                f"| Length-Score Correlation | {lb.correlation:.3f} |",
                f"| P-Value | {lb.p_value:.4f} |",
                f"| Sample Size | {lb.sample_size} |",
                "",
                f"**Analysis**: {lb.interpretation}",
                "",
            ]
        )

    if report.warnings:
        out.extend(["## Warnings", ""])
        out.extend(f"- {warning}" for warning in report.warnings)
        out.append("")

    if report.recommendations:
        out.extend(["## Recommendations", ""])
        out.extend(f"- {rec}" for rec in report.recommendations)
        out.append("")

    return "\n".join(out)

get_bias_monitor(project_root)

Get or create bias monitor singleton.

Source code in src/boring/judge/bias_monitor.py
def get_bias_monitor(project_root: Path) -> BiasMonitor:
    """Get or create bias monitor singleton.

    Lazily instantiates the module-level BiasMonitor on first call; later
    calls return the cached instance, and their project_root argument is
    ignored. NOTE(review): not guarded by a lock — concurrent first calls
    could race; confirm single-threaded usage.
    """
    global _bias_monitor
    if _bias_monitor is None:
        _bias_monitor = BiasMonitor(project_root)
    return _bias_monitor

create_judge_provider()

Factory to create the appropriate LLM provider based on config.

Source code in src/boring/judge/factory.py
def create_judge_provider() -> LLMProvider:
    """Build the LLM provider the judge should use, per current settings."""
    kind = settings.LLM_PROVIDER.lower()

    if kind == "ollama":
        return OllamaProvider(
            model_name=settings.LLM_MODEL or "llama3",
            base_url=settings.LLM_BASE_URL or "http://localhost:11434",
            log_dir=settings.LOG_DIR,
        )

    if kind in ("openai_compat", "lmstudio"):
        return OpenAICompatProvider(
            model_name=settings.LLM_MODEL or "local-model",
            base_url=settings.LLM_BASE_URL or "http://localhost:1234/v1",
            log_dir=settings.LOG_DIR,
        )

    # Default to Gemini (CLI Adapter for now, as Judge typically runs via CLI)
    return create_cli_adapter(model_name=settings.DEFAULT_MODEL, log_dir=settings.LOG_DIR)

agreement_metrics(judge1, judge2, ordinal=False)

Calculate all agreement metrics.

Parameters:

Name Type Description Default
judge1 list

Ratings from first judge

required
judge2 list

Ratings from second judge

required
ordinal bool

If True, calculate weighted kappa for ordinal scales

False

Returns:

Type Description
AgreementMetrics

AgreementMetrics with all metrics

Source code in src/boring/judge/metrics.py
def agreement_metrics(judge1: list, judge2: list, ordinal: bool = False) -> AgreementMetrics:
    """
    Compute the full set of inter-rater agreement statistics for two judges.

    Args:
        judge1: Ratings from the first judge
        judge2: Ratings from the second judge
        ordinal: When True, additionally compute weighted kappa for ordinal scales

    Returns:
        AgreementMetrics bundling observed/expected agreement and kappa values
    """
    n = len(judge1)

    # Raw fraction of items where both judges gave the same rating.
    if n > 0:
        matches = sum(1 for a, b in zip(judge1, judge2, strict=True) if a == b)
        observed = matches / n
    else:
        observed = 0

    kappa = cohens_kappa(judge1, judge2)
    w_kappa = weighted_kappa(judge1, judge2) if ordinal else None

    # Chance agreement: sum over categories of the product of marginal rates.
    expected = 0.0
    for cat in set(judge1) | set(judge2):
        p1 = judge1.count(cat) / n if n > 0 else 0
        p2 = judge2.count(cat) / n if n > 0 else 0
        expected += p1 * p2

    return AgreementMetrics(
        cohens_kappa=kappa,
        weighted_kappa=w_kappa,
        observed_agreement=observed,
        expected_agreement=expected,
        interpretation=interpret_kappa(kappa),
    )

classification_metrics(predictions, ground_truth)

Calculate all classification metrics.

Parameters:

Name Type Description Default
predictions list[int]

List of predicted labels

required
ground_truth list[int]

List of actual labels

required

Returns:

Type Description
ClassificationMetrics

ClassificationMetrics with all metrics

Source code in src/boring/judge/metrics.py
def classification_metrics(
    predictions: list[int], ground_truth: list[int]
) -> ClassificationMetrics:
    """
    Compute precision/recall/F1 together with the confusion-matrix counts.

    Args:
        predictions: Predicted binary labels (1 = positive, 0 = negative)
        ground_truth: Actual binary labels

    Returns:
        ClassificationMetrics with scores and raw TP/FP/FN/TN counts

    Raises:
        ValueError: If the two label lists differ in length
    """
    if len(predictions) != len(ground_truth):
        raise ValueError("Predictions and ground truth must have same length")

    # Tally the confusion matrix in a single pass over the label pairs.
    tp = fp = fn = tn = 0
    for pred, actual in zip(predictions, ground_truth, strict=True):
        if pred == 1 and actual == 1:
            tp += 1
        elif pred == 1 and actual == 0:
            fp += 1
        elif pred == 0 and actual == 1:
            fn += 1
        elif pred == 0 and actual == 0:
            tn += 1

    prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0

    return ClassificationMetrics(
        precision=prec,
        recall=rec,
        f1_score=f1,
        true_positives=tp,
        false_positives=fp,
        false_negatives=fn,
        true_negatives=tn,
    )

cohens_kappa(judge1, judge2)

Calculate Cohen's Kappa for inter-rater agreement.

κ = (Observed Agreement - Expected Agreement) / (1 - Expected Agreement)

Parameters:

Name Type Description Default
judge1 list

Ratings from first judge

required
judge2 list

Ratings from second judge

required

Returns:

Type Description
float

Cohen's Kappa (-1.0 to 1.0)

Source code in src/boring/judge/metrics.py
def cohens_kappa(judge1: list, judge2: list) -> float:
    """
    Calculate Cohen's Kappa for inter-rater agreement.

    κ = (Observed Agreement - Expected Agreement) / (1 - Expected Agreement)

    Args:
        judge1: Ratings from first judge
        judge2: Ratings from second judge

    Returns:
        Cohen's Kappa (-1.0 to 1.0)
    """
    if len(judge1) != len(judge2):
        raise ValueError("Judge ratings must have same length")

    n = len(judge1)
    if n == 0:
        return 0.0

    # Get all unique categories
    categories = list(set(judge1) | set(judge2))

    # Count agreements
    observed_agreement = sum(1 for j1, j2 in zip(judge1, judge2, strict=True) if j1 == j2) / n

    # Calculate expected agreement by chance
    expected_agreement = 0.0
    for cat in categories:
        p1 = sum(1 for j in judge1 if j == cat) / n
        p2 = sum(1 for j in judge2 if j == cat) / n
        expected_agreement += p1 * p2

    # Calculate kappa
    if expected_agreement == 1.0:
        return 1.0 if observed_agreement == 1.0 else 0.0

    kappa = (observed_agreement - expected_agreement) / (1 - expected_agreement)
    return kappa

correlation_metrics(scores1, scores2)

Calculate all correlation metrics.

Parameters:

Name Type Description Default
scores1 list[float]

First set of scores

required
scores2 list[float]

Second set of scores

required

Returns:

Type Description
CorrelationMetrics

CorrelationMetrics with all metrics

Source code in src/boring/judge/metrics.py
def correlation_metrics(scores1: list[float], scores2: list[float]) -> CorrelationMetrics:
    """
    Compute rank and linear correlation statistics between two score lists.

    Args:
        scores1: First set of scores
        scores2: Second set of scores

    Returns:
        CorrelationMetrics with Spearman/Kendall/Pearson coefficients,
        their p-values, and a verbal interpretation of Spearman's rho
    """
    spearman = spearmans_rho(scores1, scores2)
    kendall = kendalls_tau(scores1, scores2)
    pearson = pearsons_r(scores1, scores2)

    return CorrelationMetrics(
        spearmans_rho=spearman[0],
        kendalls_tau=kendall[0],
        pearsons_r=pearson[0],
        p_value_spearman=spearman[1],
        p_value_kendall=kendall[1],
        p_value_pearson=pearson[1],
        interpretation=interpret_correlation(spearman[0]),
    )

f1_score(predictions, ground_truth)

Calculate F1 score: 2 * (precision * recall) / (precision + recall)

Parameters:

Name Type Description Default
predictions list[int]

List of predicted labels

required
ground_truth list[int]

List of actual labels

required

Returns:

Type Description
float

F1 score (0.0 to 1.0)

Source code in src/boring/judge/metrics.py
def f1_score(predictions: list[int], ground_truth: list[int]) -> float:
    """
    Compute F1, the harmonic mean of precision and recall.

    Args:
        predictions: Predicted binary labels
        ground_truth: Actual binary labels

    Returns:
        F1 in [0.0, 1.0]; 0.0 when precision and recall are both zero
    """
    p = precision(predictions, ground_truth)
    r = recall(predictions, ground_truth)
    denom = p + r
    return 2 * p * r / denom if denom > 0 else 0.0

format_metrics_report(report)

Format metrics report as markdown.

Source code in src/boring/judge/metrics.py
def format_metrics_report(report: EvaluationMetricsReport) -> str:
    """Format metrics report as markdown."""
    lines = ["# 📊 Evaluation Metrics Report", ""]

    lines.append(f"**Evaluation Type**: {report.evaluation_type}")
    lines.append(f"**Sample Size**: {report.sample_size}")
    lines.append("")

    # Classification metrics
    if report.classification:
        lines.append("## Classification Metrics")
        lines.append("")
        lines.append("| Metric | Value |")
        lines.append("|--------|-------|")
        lines.append(f"| Precision | {report.classification.precision:.3f} |")
        lines.append(f"| Recall | {report.classification.recall:.3f} |")
        lines.append(f"| F1 Score | {report.classification.f1_score:.3f} |")
        lines.append("")

    # Correlation metrics
    if report.correlation:
        lines.append("## Correlation Metrics")
        lines.append("")
        lines.append("| Metric | Value | p-value |")
        lines.append("|--------|-------|---------|")
        lines.append(
            f"| Spearman's ρ | {report.correlation.spearmans_rho:.3f} | {report.correlation.p_value_spearman:.4f} |"
        )
        lines.append(
            f"| Kendall's τ | {report.correlation.kendalls_tau:.3f} | {report.correlation.p_value_kendall:.4f} |"
        )
        lines.append(
            f"| Pearson's r | {report.correlation.pearsons_r:.3f} | {report.correlation.p_value_pearson:.4f} |"
        )
        lines.append("")
        lines.append(f"**Interpretation**: {report.correlation.interpretation}")
        lines.append("")

    # Agreement metrics
    if report.agreement:
        lines.append("## Agreement Metrics")
        lines.append("")
        lines.append("| Metric | Value |")
        lines.append("|--------|-------|")
        lines.append(f"| Cohen's κ | {report.agreement.cohens_kappa:.3f} |")
        if report.agreement.weighted_kappa is not None:
            lines.append(f"| Weighted κ | {report.agreement.weighted_kappa:.3f} |")
        lines.append(f"| Observed Agreement | {report.agreement.observed_agreement:.3f} |")
        lines.append("")
        lines.append(f"**Interpretation**: {report.agreement.interpretation}")
        lines.append("")

    # Pairwise metrics
    if report.pairwise:
        lines.append("## Pairwise Comparison Metrics")
        lines.append("")
        lines.append("| Metric | Value |")
        lines.append("|--------|-------|")
        lines.append(f"| Position Consistency | {report.pairwise.position_consistency:.1%} |")
        lines.append(f"| Tie Rate | {report.pairwise.tie_rate:.1%} |")
        lines.append(f"| Total Comparisons | {report.pairwise.total_comparisons} |")
        lines.append("")

    # Warnings
    if report.warnings:
        lines.append("## ⚠️ Warnings")
        lines.append("")
        for warning in report.warnings:
            lines.append(f"- {warning}")
        lines.append("")

    # Recommendations
    if report.recommendations:
        lines.append("## 💡 Recommendations")
        lines.append("")
        for rec in report.recommendations:
            lines.append(f"- {rec}")
        lines.append("")

    return "\n".join(lines)

generate_metrics_report(automated_scores=None, human_scores=None, predictions=None, ground_truth=None, pairwise_comparisons=None, evaluation_type='general')

Generate comprehensive evaluation metrics report.

Parameters:

Name Type Description Default
automated_scores list[float] | None

Scores from automated evaluation

None
human_scores list[float] | None

Scores from human evaluation

None
predictions list[int] | None

Binary predictions (for classification)

None
ground_truth list[int] | None

Ground truth labels (for classification)

None
pairwise_comparisons list[dict] | None

Pairwise comparison results

None
evaluation_type str

Type of evaluation (ordinal, binary, pairwise)

'general'

Returns:

Type Description
EvaluationMetricsReport

EvaluationMetricsReport with all applicable metrics

Source code in src/boring/judge/metrics.py
def generate_metrics_report(
    automated_scores: list[float] | None = None,
    human_scores: list[float] | None = None,
    predictions: list[int] | None = None,
    ground_truth: list[int] | None = None,
    pairwise_comparisons: list[dict] | None = None,
    evaluation_type: str = "general",
) -> EvaluationMetricsReport:
    """
    Build an EvaluationMetricsReport covering every metric family for which
    input data was supplied, plus data-quality warnings and recommendations.

    Args:
        automated_scores: Scores from the automated evaluator
        human_scores: Scores from human raters (paired with automated_scores)
        predictions: Binary predictions for classification metrics
        ground_truth: Binary ground-truth labels for classification metrics
        pairwise_comparisons: Results of pairwise A/B comparisons
        evaluation_type: Kind of evaluation ("ordinal", "binary", "pairwise", ...)

    Returns:
        Populated EvaluationMetricsReport; note sample_size reflects the
        last metric family computed
    """
    report = EvaluationMetricsReport(evaluation_type=evaluation_type)
    warns: list[str] = []
    recs: list[str] = []

    # --- Classification metrics (binary labels) ---
    if predictions is not None and ground_truth is not None:
        report.classification = classification_metrics(predictions, ground_truth)
        report.sample_size = len(predictions)

        if report.classification.precision < 0.7:
            warns.append("Precision below 0.7 - high false positive rate")
        if report.classification.recall < 0.7:
            warns.append("Recall below 0.7 - high false negative rate")

    # --- Correlation / agreement against human scores ---
    if automated_scores is not None and human_scores is not None:
        report.correlation = correlation_metrics(automated_scores, human_scores)
        report.sample_size = len(automated_scores)

        if evaluation_type == "ordinal":
            # Agreement metrics need discrete categories, so round the scores.
            rounded_auto = [round(s) for s in automated_scores]
            rounded_human = [round(s) for s in human_scores]
            report.agreement = agreement_metrics(rounded_auto, rounded_human, ordinal=True)

        if report.correlation.spearmans_rho < 0.6:
            warns.append("Spearman's ρ below 0.6 - weak correlation with human judgment")
            recs.append("Review evaluation criteria for clarity")

    # --- Pairwise comparison metrics ---
    if pairwise_comparisons is not None:
        report.pairwise = pairwise_metrics(pairwise_comparisons)
        report.sample_size = len(pairwise_comparisons)

        if report.pairwise.position_consistency < 0.8:
            warns.append("Position consistency below 0.8 - position bias may be present")
            recs.append("Increase number of position swaps or use multiple passes")

        if report.pairwise.tie_rate > 0.3:
            warns.append("High tie rate (>30%) - criteria may need refinement")

    # --- Generic sample-size sanity check ---
    if report.sample_size < 50:
        warns.append(f"Small sample size ({report.sample_size}) - metrics may be unreliable")
        recs.append("Collect more evaluation samples for reliable metrics")

    report.warnings = warns
    report.recommendations = recs

    return report

kendalls_tau(scores1, scores2)

Calculate Kendall's tau correlation coefficient.

Parameters:

Name Type Description Default
scores1 list[float]

First set of scores

required
scores2 list[float]

Second set of scores

required

Returns:

Type Description
tuple[float, float]

Tuple of (tau, p_value)

Source code in src/boring/judge/metrics.py
def kendalls_tau(scores1: list[float], scores2: list[float]) -> tuple[float, float]:
    """
    Compute Kendall's tau rank correlation with an approximate p-value.

    Pairs tied in either list are excluded from both the concordant and
    discordant counts.

    Args:
        scores1: First set of scores
        scores2: Second set of scores

    Returns:
        (tau, p_value); (0.0, 1.0) for degenerate input (n < 2 or all ties)

    Raises:
        ValueError: If the score lists differ in length
    """
    if len(scores1) != len(scores2):
        raise ValueError("Score lists must have same length")

    n = len(scores1)
    if n < 2:
        return 0.0, 1.0

    concordant = 0
    discordant = 0
    for j in range(1, n):
        for i in range(j):
            # Positive product => the pair is ordered the same way in both lists.
            direction = (scores1[i] - scores1[j]) * (scores2[i] - scores2[j])
            if direction > 0:
                concordant += 1
            elif direction < 0:
                discordant += 1
            # direction == 0 is a tie and contributes to neither count

    total = concordant + discordant
    if total == 0:
        return 0.0, 1.0

    tau = (concordant - discordant) / total

    # Normal approximation for the p-value under H0: tau == 0.
    var = (2 * (2 * n + 5)) / (9 * n * (n - 1))
    z = tau / math.sqrt(var) if var > 0 else 0
    p_value = 2 * (1 - _cdf_normal(abs(z)))

    return tau, p_value

pairwise_metrics(comparisons)

Calculate all pairwise comparison metrics.

Parameters:

Name Type Description Default
comparisons list[dict]

List of comparison results with 'winner' and 'position_consistent' fields

required

Returns:

Type Description
PairwiseMetrics

PairwiseMetrics with all metrics

Source code in src/boring/judge/metrics.py
def pairwise_metrics(comparisons: list[dict]) -> PairwiseMetrics:
    """
    Summarize the outcomes of a batch of pairwise comparisons.

    Args:
        comparisons: Comparison dicts; 'winner' marks ties, and either
            'position_consistent' or 'positionConsistency' flags whether the
            verdict survived swapping the response positions

    Returns:
        PairwiseMetrics with consistency rates, tie rate, and raw counts
    """
    total = len(comparisons)
    if total == 0:
        # No data: report zeroed-out metrics rather than dividing by zero.
        return PairwiseMetrics(
            agreement_rate=0.0,
            position_consistency=0.0,
            tie_rate=0.0,
            total_comparisons=0,
            consistent_decisions=0,
        )

    ties = 0
    consistent = 0
    for comp in comparisons:
        if comp.get("winner", "").upper() == "TIE":
            ties += 1
        # Accept either snake_case or camelCase flag names.
        if comp.get("position_consistent", comp.get("positionConsistency", False)):
            consistent += 1

    return PairwiseMetrics(
        agreement_rate=consistent / total,
        position_consistency=consistent / total,
        tie_rate=ties / total,
        total_comparisons=total,
        consistent_decisions=consistent,
    )

pearsons_r(scores1, scores2)

Calculate Pearson's correlation coefficient.

Parameters:

Name Type Description Default
scores1 list[float]

First set of scores

required
scores2 list[float]

Second set of scores

required

Returns:

Type Description
tuple[float, float]

Tuple of (r, p_value)

Source code in src/boring/judge/metrics.py
def pearsons_r(scores1: list[float], scores2: list[float]) -> tuple[float, float]:
    """
    Calculate Pearson's correlation coefficient.

    Args:
        scores1: First set of scores
        scores2: Second set of scores

    Returns:
        Tuple of (r, p_value)
    """
    if len(scores1) != len(scores2):
        raise ValueError("Score lists must have same length")

    n = len(scores1)
    if n < 3:
        return 0.0, 1.0

    mean1 = sum(scores1) / n
    mean2 = sum(scores2) / n

    numerator = sum((s1 - mean1) * (s2 - mean2) for s1, s2 in zip(scores1, scores2, strict=True))
    denom1 = math.sqrt(sum((s1 - mean1) ** 2 for s1 in scores1))
    denom2 = math.sqrt(sum((s2 - mean2) ** 2 for s2 in scores2))

    if denom1 == 0 or denom2 == 0:
        return 0.0, 1.0

    r = numerator / (denom1 * denom2)

    # Approximate p-value
    if abs(r) == 1.0:
        p_value = 0.0
    else:
        t_stat = r * math.sqrt((n - 2) / (1 - r**2))
        p_value = 2 * (1 - _cdf_t(abs(t_stat), n - 2))

    return r, p_value

precision(predictions, ground_truth)

Calculate precision: TP / (TP + FP)

Parameters:

Name Type Description Default
predictions list[int]

List of predicted labels (1 = positive, 0 = negative)

required
ground_truth list[int]

List of actual labels

required

Returns:

Type Description
float

Precision score (0.0 to 1.0)

Source code in src/boring/judge/metrics.py
def precision(predictions: list[int], ground_truth: list[int]) -> float:
    """
    Calculate precision: TP / (TP + FP)

    Args:
        predictions: List of predicted labels (1 = positive, 0 = negative)
        ground_truth: List of actual labels

    Returns:
        Precision score (0.0 to 1.0)
    """
    if len(predictions) != len(ground_truth):
        raise ValueError("Predictions and ground truth must have same length")

    true_positives = sum(
        1 for p, g in zip(predictions, ground_truth, strict=True) if p == 1 and g == 1
    )
    predicted_positives = sum(predictions)

    return true_positives / predicted_positives if predicted_positives > 0 else 0.0

recall(predictions, ground_truth)

Calculate recall: TP / (TP + FN)

Parameters:

Name Type Description Default
predictions list[int]

List of predicted labels

required
ground_truth list[int]

List of actual labels

required

Returns:

Type Description
float

Recall score (0.0 to 1.0)

Source code in src/boring/judge/metrics.py
def recall(predictions: list[int], ground_truth: list[int]) -> float:
    """
    Calculate recall: TP / (TP + FN)

    Args:
        predictions: List of predicted labels
        ground_truth: List of actual labels

    Returns:
        Recall score (0.0 to 1.0)
    """
    if len(predictions) != len(ground_truth):
        raise ValueError("Predictions and ground truth must have same length")

    true_positives = sum(
        1 for p, g in zip(predictions, ground_truth, strict=True) if p == 1 and g == 1
    )
    actual_positives = sum(ground_truth)

    return true_positives / actual_positives if actual_positives > 0 else 0.0

spearmans_rho(scores1, scores2)

Calculate Spearman's rank correlation coefficient.

Parameters:

Name Type Description Default
scores1 list[float]

First set of scores

required
scores2 list[float]

Second set of scores

required

Returns:

Type Description
tuple[float, float]

Tuple of (rho, p_value)

Source code in src/boring/judge/metrics.py
def spearmans_rho(scores1: list[float], scores2: list[float]) -> tuple[float, float]:
    """
    Compute Spearman's rank correlation with an approximate p-value.

    Implemented as the Pearson correlation of the rank-transformed data
    (ranking delegated to the module-level _rank helper).

    Args:
        scores1: First set of scores
        scores2: Second set of scores

    Returns:
        (rho, p_value); (0.0, 1.0) when n < 3 or the ranks have zero variance

    Raises:
        ValueError: If the score lists differ in length
    """
    if len(scores1) != len(scores2):
        raise ValueError("Score lists must have same length")

    n = len(scores1)
    if n < 3:
        return 0.0, 1.0

    ranks1 = _rank(scores1)
    ranks2 = _rank(scores2)

    mu1 = sum(ranks1) / n
    mu2 = sum(ranks2) / n

    cov = sum((a - mu1) * (b - mu2) for a, b in zip(ranks1, ranks2, strict=True))
    sd1 = math.sqrt(sum((a - mu1) ** 2 for a in ranks1))
    sd2 = math.sqrt(sum((b - mu2) ** 2 for b in ranks2))

    if sd1 == 0 or sd2 == 0:
        return 0.0, 1.0

    rho = cov / (sd1 * sd2)

    # Approximate p-value via the t-distribution with n-2 degrees of freedom.
    if abs(rho) == 1.0:
        p_value = 0.0
    else:
        t_stat = rho * math.sqrt((n - 2) / (1 - rho**2))
        p_value = 2 * (1 - _cdf_t(abs(t_stat), n - 2))

    return rho, p_value

weighted_kappa(judge1, judge2, weights='quadratic')

Calculate weighted Cohen's Kappa for ordinal scales.

Parameters:

Name Type Description Default
judge1 list[int]

Ratings from first judge (ordinal integers)

required
judge2 list[int]

Ratings from second judge (ordinal integers)

required
weights str

Weighting scheme - 'linear' or 'quadratic'

'quadratic'

Returns:

Type Description
float

Weighted Kappa (-1.0 to 1.0)

Source code in src/boring/judge/metrics.py
def weighted_kappa(judge1: list[int], judge2: list[int], weights: str = "quadratic") -> float:
    """
    Calculate weighted Cohen's Kappa for ordinal scales.

    Args:
        judge1: Ratings from first judge (ordinal integers)
        judge2: Ratings from second judge (ordinal integers)
        weights: Weighting scheme - 'linear' or 'quadratic'

    Returns:
        Weighted Kappa (-1.0 to 1.0)
    """
    if len(judge1) != len(judge2):
        raise ValueError("Judge ratings must have same length")

    n = len(judge1)
    if n == 0:
        return 0.0

    # Get all unique categories (ordered)
    categories = sorted(set(judge1) | set(judge2))
    k = len(categories)

    if k < 2:
        return 1.0 if all(j1 == j2 for j1, j2 in zip(judge1, judge2, strict=True)) else 0.0

    # Create category index mapping
    cat_to_idx = {cat: i for i, cat in enumerate(categories)}

    # Calculate weight matrix
    weight_matrix = []
    for i in range(k):
        row = []
        for j in range(k):
            if weights == "linear":
                w = abs(i - j) / (k - 1)
            else:  # quadratic
                w = ((i - j) ** 2) / ((k - 1) ** 2)
            row.append(w)
        weight_matrix.append(row)

    # Count confusion matrix
    confusion = [[0] * k for _ in range(k)]
    for j1, j2 in zip(judge1, judge2, strict=True):
        i1 = cat_to_idx[j1]
        i2 = cat_to_idx[j2]
        confusion[i1][i2] += 1

    # Calculate observed disagreement
    observed = sum(weight_matrix[i][j] * confusion[i][j] for i in range(k) for j in range(k)) / n

    # Calculate expected disagreement
    row_marginals = [sum(confusion[i]) / n for i in range(k)]
    col_marginals = [sum(confusion[i][j] for i in range(k)) / n for j in range(k)]

    expected = sum(
        weight_matrix[i][j] * row_marginals[i] * col_marginals[j]
        for i in range(k)
        for j in range(k)
    )

    # Calculate weighted kappa
    if expected == 0:
        return 1.0 if observed == 0 else 0.0

    return 1 - (observed / expected)

format_rubric_json(rubric)

Convert rubric to JSON-serializable dict.

Source code in src/boring/judge/rubric_generator.py
def format_rubric_json(rubric: DetailedRubric) -> dict:
    """Convert rubric to JSON-serializable dict."""
    return {
        "name": rubric.name,
        "description": rubric.description,
        "domain": rubric.domain,
        "scale": rubric.scale,
        "strictness": rubric.strictness,
        "criteria": [
            {
                "name": c.name,
                "description": c.description,
                "weight": c.weight,
                "levels": [
                    {
                        "score": level.score,
                        "label": level.label,
                        "description": level.description,
                        "characteristics": level.characteristics,
                    }
                    for level in c.levels
                ],
            }
            for c in rubric.criteria
        ],
        "edge_cases": [
            {"situation": e.situation, "guidance": e.guidance} for e in rubric.general_edge_cases
        ],
        "scoring_guidelines": rubric.scoring_guidelines,
    }

generate_code_quality_rubric(strictness='balanced')

Generate a standard code quality rubric.

Source code in src/boring/judge/rubric_generator.py
def generate_code_quality_rubric(strictness: str = "balanced") -> DetailedRubric:
    """Build the standard four-criterion code quality rubric on a 1-5 scale."""
    # Readability is weighted slightly above, documentation slightly below, par.
    criterion_weights = {
        "Readability": 1.2,
        "Documentation": 0.8,
        "Modularity": 1.0,
        "Error Handling": 1.0,
    }
    return generate_rubric(
        name="Code Quality",
        description="Comprehensive evaluation of code quality",
        domain="code_quality",
        criteria_names=list(criterion_weights),
        scale="1-5",
        strictness=strictness,
        weights=criterion_weights,
    )

generate_rubric(name, description, domain, criteria_names, scale='1-5', strictness='balanced', weights=None)

Generate a detailed rubric with level descriptions.

Parameters:

Name Type Description Default
name str

Name of the rubric

required
description str

Description of what this rubric evaluates

required
domain str

Domain for level templates (code_quality, security, performance, documentation)

required
criteria_names list[str]

List of criterion names

required
scale str

Rating scale (1-3, 1-5, 1-10)

'1-5'
strictness str

Strictness level (lenient, balanced, strict)

'balanced'
weights dict[str, float] | None

Optional weights for each criterion

None

Returns:

Type Description
DetailedRubric

DetailedRubric with complete level descriptions

Source code in src/boring/judge/rubric_generator.py
def generate_rubric(
    name: str,
    description: str,
    domain: str,
    criteria_names: list[str],
    scale: str = "1-5",
    strictness: str = "balanced",
    weights: dict[str, float] | None = None,
) -> DetailedRubric:
    """
    Generate a detailed rubric with per-level descriptions for each criterion.

    Args:
        name: Name of the rubric
        description: What this rubric evaluates
        domain: Domain for level templates (code_quality, security, performance,
            documentation); unknown domains fall back to code_quality
        criteria_names: Names of the criteria to include
        scale: Rating scale ("1-3", "1-5", "1-10"); any other value yields
            criteria with no levels (historical behavior)
        strictness: Scoring strictness (lenient, balanced, strict)
        weights: Optional per-criterion weights (default 1.0 each)

    Returns:
        DetailedRubric with complete level descriptions
    """
    weights = weights or {}
    levels_template = LEVEL_TEMPLATES.get(domain, LEVEL_TEMPLATES["code_quality"])

    def _build_levels() -> list:
        """Produce the level list for one criterion, mapped onto 1-5 templates."""
        if scale == "1-5":
            pairs = [(s, s) for s in range(1, 6)]
        elif scale == "1-3":
            # Coarse scale: map 1/2/3 onto the low/middle/high 1-5 templates.
            pairs = [(1, 1), (2, 3), (3, 5)]
        elif scale == "1-10":
            # Fine scale: each pair of scores shares one 1-5 template.
            pairs = [(s, min(5, max(1, (s + 1) // 2))) for s in range(1, 11)]
        else:
            pairs = []

        built = []
        for score, template_score in pairs:
            template = levels_template.get(
                template_score,
                {"label": f"Level {score}", "description": "", "characteristics": []},
            )
            label = template["label"]
            if scale == "1-10":
                label = f"{label} ({score}/10)"
            built.append(
                RubricLevel(
                    score=score,
                    label=label,
                    description=template["description"],
                    characteristics=template["characteristics"],
                )
            )
        return built

    criteria = [
        DetailedCriterion(
            name=criterion_name,
            description=f"Evaluate {criterion_name.lower()}",
            weight=weights.get(criterion_name, 1.0),
            levels=_build_levels(),
            edge_cases=EDGE_CASE_TEMPLATES.get(domain, []),
        )
        for criterion_name in criteria_names
    ]

    return DetailedRubric(
        name=name,
        description=description,
        domain=domain,
        scale=scale,
        strictness=strictness,
        criteria=criteria,
        general_edge_cases=EDGE_CASE_TEMPLATES.get(domain, []),
        scoring_guidelines=SCORING_GUIDELINES.get(strictness, SCORING_GUIDELINES["balanced"]),
    )

generate_security_rubric(strictness='strict')

Generate a security-focused rubric.

Source code in src/boring/judge/rubric_generator.py
def generate_security_rubric(strictness: str = "strict") -> DetailedRubric:
    """Build a security-audit rubric; secrets management carries double weight."""
    criterion_weights = {
        "Secrets Management": 2.0,
        "Input Validation": 1.5,
        "Injection Prevention": 1.5,
    }
    return generate_rubric(
        name="Security Audit",
        description="Security vulnerability assessment",
        domain="security",
        criteria_names=list(criterion_weights),
        scale="1-5",
        strictness=strictness,
        weights=criterion_weights,
    )

rubric_to_prompt(rubric)

Convert a DetailedRubric to a prompt string for LLM evaluation.

Parameters:

Name Type Description Default
rubric DetailedRubric

The rubric to convert

required

Returns:

Type Description
str

Formatted prompt string

Source code in src/boring/judge/rubric_generator.py
def rubric_to_prompt(rubric: DetailedRubric) -> str:
    """
    Convert a DetailedRubric to a prompt string for LLM evaluation.

    Args:
        rubric: The rubric to convert

    Returns:
        Formatted prompt string
    """
    lines = [f"# {rubric.name}", "", rubric.description, ""]

    lines.append("## Criteria")
    lines.append("")
    for criterion in rubric.criteria:
        lines.append(f"### {criterion.name} (Weight: {criterion.weight})")
        lines.append("")
        for level in criterion.levels:
            lines.append(f"**{level.score} - {level.label}**: {level.description}")
            if level.characteristics:
                for char in level.characteristics:
                    lines.append(f"  - {char}")
        lines.append("")

    if rubric.general_edge_cases:
        lines.append("## Edge Cases")
        lines.append("")
        for ec in rubric.general_edge_cases:
            lines.append(f"- **{ec.situation}**: {ec.guidance}")
        lines.append("")

    if rubric.scoring_guidelines:
        lines.append("## Scoring Guidelines")
        lines.append("")
        for guideline in rubric.scoring_guidelines:
            lines.append(f"- {guideline}")
        lines.append("")

    return "\n".join(lines)