LLM Reranking: Using LLMs to Reorder Your Results
LLMs can rerank search results with deep contextual understanding. Learn when and how to use this expensive but powerful technique.
TL;DR
- LLM reranking = using GPT-4/Claude to score result relevance
- Advantage: superior semantic understanding vs cross-encoders
- Disadvantage: 10-100x slower and more expensive
- Use cases: complex queries, specialized domains, high-value requests
- Test different reranking strategies on Ailog
Why Use an LLM for Reranking?
Cross-encoders (BERT-based models, Cohere Rerank) are fast but limited:
- Trained on general-purpose data
- Weaker grasp of domain-specific nuance
- A single relevance score with no explanation (most are trained on binary relevant/not-relevant labels)
LLMs bring:
- Reasoning: they can explain why a document is relevant
- Context awareness: they understand query nuances
- Flexibility: they adapt to any domain without fine-tuning
Basic Implementation
LLM Scoring
```python
from openai import OpenAI

client = OpenAI()

def llm_rerank(query: str, documents: list, top_k: int = 3) -> list:
    """
    Reranks documents using an LLM.
    """
    scored_docs = []

    for doc in documents:
        prompt = f"""Rate the relevance of this document to the query.

Query: {query}

Document: {doc['content'][:1500]}

Rate from 0-10 where:
- 0: Completely irrelevant
- 5: Partially relevant
- 10: Highly relevant and directly answers the query

Output ONLY a number between 0 and 10."""

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=5,
            temperature=0
        )

        try:
            score = float(response.choices[0].message.content.strip())
        except ValueError:
            score = 5.0  # Default score

        scored_docs.append({
            **doc,
            "relevance_score": score
        })

    # Sort by descending score
    scored_docs.sort(key=lambda x: x["relevance_score"], reverse=True)

    return scored_docs[:top_k]
```
With Explanation
```python
def llm_rerank_with_reasoning(query: str, documents: list, top_k: int = 3) -> list:
    """
    Reranks with score explanation.
    """
    prompt = f"""You are a relevance judge. Rate each document's relevance to the query.

Query: {query}

Documents:
"""

    for i, doc in enumerate(documents):
        prompt += f"\n[Doc {i+1}]: {doc['content'][:500]}...\n"

    prompt += """
For each document, output:
- Document number
- Relevance score (0-10)
- One sentence explaining why

Format:
Doc 1: 8/10 - Directly addresses the main question about...
Doc 2: 3/10 - Only tangentially related to...
"""

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    # Parse response
    result = parse_ranking_response(response.choices[0].message.content)

    return result[:top_k]
```
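The call above uses a `parse_ranking_response` helper that isn't defined in this guide. A minimal sketch of what it could look like, assuming the model follows the requested `Doc N: score/10 - reason` format and that the helper also receives the candidate list so it can map scores back to documents (the call above would then pass `documents` as a second argument):

```python
import re

def parse_ranking_response(text: str, documents: list) -> list:
    """
    Parses lines like "Doc 1: 8/10 - Directly addresses..." and returns the
    candidate documents sorted by the parsed score, with the score and the
    model's one-sentence reasoning attached.
    """
    scored = []
    for match in re.finditer(r"Doc\s+(\d+):\s*(\d+(?:\.\d+)?)\s*/\s*10\s*-\s*(.+)", text):
        idx = int(match.group(1)) - 1  # the model numbers documents from 1
        if 0 <= idx < len(documents):
            scored.append({
                **documents[idx],
                "relevance_score": float(match.group(2)),
                "reasoning": match.group(3).strip(),
            })
    # Highest-scored documents first; documents the model skipped are dropped
    scored.sort(key=lambda d: d["relevance_score"], reverse=True)
    return scored
```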
Pairwise Comparison Approach
Pairwise comparison is more robust than absolute scoring, but it requires n(n-1)/2 LLM calls for n candidates:
```python
def pairwise_llm_rerank(query: str, documents: list, top_k: int = 3) -> list:
    """
    Compares documents pairwise for more accurate ranking.
    """
    n = len(documents)
    wins = {i: 0 for i in range(n)}

    # Compare each pair and credit the win to the actual document index
    for i in range(n):
        for j in range(i + 1, n):
            winner = i if compare_pair(query, documents[i], documents[j]) == 0 else j
            wins[winner] += 1

    # Sort by number of wins
    ranked_indices = sorted(wins.keys(), key=lambda x: wins[x], reverse=True)

    return [documents[i] for i in ranked_indices[:top_k]]


def compare_pair(query: str, doc_a: dict, doc_b: dict) -> int:
    """
    Compares two documents; returns 0 if doc_a is more relevant, 1 otherwise.
    """
    prompt = f"""Which document is more relevant to this query?

Query: {query}

Document A: {doc_a['content'][:800]}

Document B: {doc_b['content'][:800]}

Answer with only "A" or "B"."""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=1,
        temperature=0
    )

    answer = response.choices[0].message.content.strip().upper()
    return 0 if answer == "A" else 1
```
Cost Optimization
Batch Processing with Claude
```python
import anthropic

client = anthropic.Anthropic()

def batch_llm_rerank(query: str, documents: list, top_k: int = 5) -> list:
    """
    Reranks all documents in a single LLM call.
    """
    docs_text = "\n\n".join([
        f"[{i+1}] {doc['content'][:600]}"
        for i, doc in enumerate(documents)
    ])

    prompt = f"""Rank these documents by relevance to the query.

Query: {query}

Documents:
{docs_text}

Return ONLY the document numbers in order of relevance, comma-separated.
Example: 3,1,5,2,4"""

    response = client.messages.create(
        model="claude-3-5-haiku-latest",
        max_tokens=50,
        messages=[{"role": "user", "content": prompt}]
    )

    # Parse order
    order_str = response.content[0].text.strip()
    order = [int(x.strip()) - 1 for x in order_str.split(",")]

    # Reorder documents
    reranked = [documents[i] for i in order if i < len(documents)]

    return reranked[:top_k]
```
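Note that `int(x.strip())` raises `ValueError` as soon as the model adds any text around the numbers. A slightly more defensive parser (an illustrative sketch, not part of the original) pulls the numbers out with a regex and deduplicates them:

```python
import re

def parse_order(order_str: str, n_docs: int) -> list:
    """
    Extracts 1-based document numbers from the model output and converts them
    to valid, unique 0-based indices, ignoring anything that isn't a number.
    """
    order = []
    seen = set()
    for raw in re.findall(r"\d+", order_str):
        idx = int(raw) - 1
        if 0 <= idx < n_docs and idx not in seen:
            seen.add(idx)
            order.append(idx)
    return order
```

In `batch_llm_rerank`, the parsing step would then become `order = parse_order(order_str, len(documents))`.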
Hybrid Strategy: Cross-Encoder + LLM
```python
from sentence_transformers import CrossEncoder

cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def hybrid_rerank(query: str, documents: list, top_k: int = 3) -> list:
    """
    1. Fast cross-encoder for filtering (top 10)
    2. LLM for final ranking (top 3)
    """
    # Step 1: Cross-encoder (fast)
    pairs = [(query, doc['content']) for doc in documents]
    scores = cross_encoder.predict(pairs)

    # Top 10 by cross-encoder
    top_indices = scores.argsort()[-10:][::-1]
    candidates = [documents[i] for i in top_indices]

    # Step 2: LLM to refine (expensive but on fewer docs)
    final_ranking = batch_llm_rerank(query, candidates, top_k)

    return final_ranking
```
LLM Reranking with Domain Context
Specialized Prompt
```python
def domain_specific_rerank(
    query: str,
    documents: list,
    domain: str,
    top_k: int = 3
) -> list:
    """
    Reranking with domain-specific context.
    """
    domain_context = {
        "legal": """You are a legal research expert. Prioritize:
- Exact legal citations and case law
- Jurisdictional relevance
- Recency of legal precedents""",
        "medical": """You are a medical research expert. Prioritize:
- Clinical evidence and study quality
- Patient safety considerations
- Guideline compliance""",
        "ecommerce": """You are an e-commerce product expert. Prioritize:
- Product specification matches
- Price and availability relevance
- User intent (browse vs. buy)"""
    }

    context = domain_context.get(domain, "You are a relevance expert.")

    prompt = f"""{context}

Query: {query}

Rank these documents by relevance:
{format_documents(documents)}

Return document numbers in order of relevance."""

    # ... LLM call
```
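The `format_documents` helper isn't shown. A minimal sketch, assuming the same numbered-snippet layout as the batch example above:

```python
def format_documents(documents: list, max_chars: int = 600) -> str:
    """
    Formats candidate documents as numbered snippets for the ranking prompt.
    """
    return "\n\n".join(
        f"[{i + 1}] {doc['content'][:max_chars]}"
        for i, doc in enumerate(documents)
    )
```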
Comparison of Approaches
| Method | Latency | Cost | Quality | Use Case |
|---|---|---|---|---|
| Cross-encoder | ~50ms | Free | Good | General use |
| Cohere Rerank | ~100ms | $1/1K req | Very good | Production |
| GPT-4o-mini | ~500ms | $0.15/1K | Excellent | Specialized domains |
| GPT-4o | ~1s | $2.50/1K | Best | High value |
| Claude Haiku | ~300ms | $0.25/1K | Very good | Good value ratio |
Approximate costs for reranking 10 documents
When to Use LLM Reranking
Use it when:
- Queries are complex or multi-hop
- The domain is highly specialized and no training data is available
- Each query carries high value (legal, medical, finance)
- You need explanations for the ranking
- Cross-encoders are not accurate enough

Avoid it when:
- Volume is high (> 1,000 requests/day)
- Latency is critical (< 200 ms required)
- Budget is limited
- Queries are simple and direct

These criteria can be folded into a simple routing check, sketched below.
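The function name, thresholds, and domain list in this sketch are illustrative assumptions, not benchmarked recommendations:

```python
def should_use_llm_rerank(
    query: str,
    domain: str,
    daily_volume: int,
    latency_budget_ms: int,
) -> bool:
    """
    Routing heuristic: default to a cross-encoder unless the query profile
    justifies the extra cost and latency of an LLM reranker.
    """
    high_value_domains = {"legal", "medical", "finance"}

    if daily_volume > 1000 or latency_budget_ms < 200:
        return False  # too expensive or too slow for this traffic profile
    if domain in high_value_domains:
        return True   # high value per query justifies the cost
    # Crude proxy for query complexity: long or multi-clause queries
    return len(query.split()) > 12 or " and " in query.lower()
```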
Metrics and Evaluation
```python
def evaluate_reranking(
    queries: list,
    ground_truth: dict,
    rerank_fn: callable
) -> dict:
    """
    Evaluates reranking quality.
    """
    metrics = {
        "mrr": [],          # Mean Reciprocal Rank
        "ndcg@3": [],       # Normalized DCG (see helper sketch below)
        "precision@1": []
    }

    for query in queries:
        # Retrieve candidates
        candidates = retrieve(query, k=20)

        # Rerank
        reranked = rerank_fn(query, candidates)

        # Calculate metrics
        relevant_docs = ground_truth[query]

        # MRR
        for i, doc in enumerate(reranked):
            if doc['id'] in relevant_docs:
                metrics["mrr"].append(1 / (i + 1))
                break
        else:
            metrics["mrr"].append(0)

        # Precision@1
        if reranked and reranked[0]['id'] in relevant_docs:
            metrics["precision@1"].append(1)
        else:
            metrics["precision@1"].append(0)

    # Average each metric, skipping any that were never filled
    return {k: sum(v) / len(v) for k, v in metrics.items() if v}
```
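The metrics dict declares `ndcg@3` but never fills it (which is why the final averaging has to skip empty lists). A minimal binary-relevance nDCG@k helper that could be called inside the loop, assuming `relevant_docs` is a set of relevant document IDs:

```python
import math

def ndcg_at_k(reranked: list, relevant_docs: set, k: int = 3) -> float:
    """
    Binary-relevance nDCG@k: a document's gain is 1 if it is relevant, else 0.
    """
    dcg = sum(
        1.0 / math.log2(i + 2)
        for i, doc in enumerate(reranked[:k])
        if doc['id'] in relevant_docs
    )
    ideal_hits = min(len(relevant_docs), k)
    idcg = sum(1.0 / math.log2(i + 2) for i in range(ideal_hits))
    return dcg / idcg if idcg > 0 else 0.0
```

Inside the evaluation loop: `metrics["ndcg@3"].append(ndcg_at_k(reranked, relevant_docs, k=3))`.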
Complete Example
```python
class LLMReranker:
    def __init__(self, model: str = "gpt-4o-mini", domain: str = None):
        self.model = model
        self.domain = domain
        self.client = OpenAI()

    def rerank(
        self,
        query: str,
        documents: list,
        top_k: int = 5,
        use_hybrid: bool = True
    ) -> list:
        """
        Complete reranking pipeline.
        """
        # Step 1: Pre-filtering if many documents
        if use_hybrid and len(documents) > 15:
            documents = self._cross_encoder_filter(query, documents, k=15)

        # Step 2: LLM reranking
        reranked = self._llm_rank(query, documents)

        return reranked[:top_k]

    def _cross_encoder_filter(self, query, docs, k):
        encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
        pairs = [(query, d['content']) for d in docs]
        scores = encoder.predict(pairs)
        top_idx = scores.argsort()[-k:][::-1]
        return [docs[i] for i in top_idx]

    def _llm_rank(self, query, docs):
        # ... batch_llm_rerank implementation
        pass


# Usage
reranker = LLMReranker(model="gpt-4o-mini", domain="legal")
results = reranker.rerank(query, candidates, top_k=5)
```
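`_llm_rank` is left as a stub above. One possible body, sketched here as an adaptation of the single-call batch prompt to the OpenAI client stored on the class (not the original implementation):

```python
# Drop-in method body for the LLMReranker class above
def _llm_rank(self, query, docs):
    """
    Single LLM call: ask the model for a comma-separated relevance order.
    """
    docs_text = "\n\n".join(
        f"[{i + 1}] {d['content'][:600]}" for i, d in enumerate(docs)
    )
    prompt = (
        "Rank these documents by relevance to the query.\n\n"
        f"Query: {query}\n\nDocuments:\n{docs_text}\n\n"
        "Return ONLY the document numbers in order of relevance, comma-separated."
    )
    response = self.client.chat.completions.create(
        model=self.model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=50,
        temperature=0,
    )
    order = [
        int(x) - 1
        for x in response.choices[0].message.content.strip().split(",")
    ]
    return [docs[i] for i in order if 0 <= i < len(docs)]
```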
Related Guides
Reranking:
- Reranking for RAG - Reranking overview
- Cross-Encoder Reranking - Fast approach
- Cohere Rerank API - Managed solution
Retrieval:
- Retrieval Strategies - Advanced techniques
- RAG Cost Optimization - Reduce costs
Is LLM reranking right for your use case? Let's evaluate your pipeline together →