Name: Ailog - RAG as a Service Platform
Availability: InStock
Rating: 4.8 (156 reviews)

TL;DR

Tabellen enthalten oft die wichtigsten Informationen (Preise, Spezifikationen, Vergleiche)
Problem: Klassische Parser zerstören die Struktur
Lösungen: Erkennung + spezialisierte Extraktion + intelligente Serialisierung
Werkzeuge: Unstructured, Camelot, Tabula, multimodale LLMs
Laden Sie Ihre PDFs mit Tabellen auf Ailog hoch

Warum Tabellen problematisch sind

Typisches Beispiel für die Zerstörung einer Tabelle:

PDF Original:
┌──────────┬─────────┬──────────┐
│ Produit  │ Prix    │ Stock    │
├──────────┼─────────┼──────────┤
│ Widget A │ 99€     │ En stock │
│ Widget B │ 149€    │ Rupture  │
└──────────┴─────────┴──────────┘

Nach naivem Parsing:
"Produit Prix Stock Widget A 99€ En stock Widget B 149€ Rupture"

→ Struktur verloren, Beziehungen zerstört

Erkennung von Tabellen

Mit Unstructured

DEVELOPERpython
from unstructured.partition.pdf import partition_pdf

def extract_with_table_detection(pdf_path: str) -> dict:
    """
    Partition a PDF and separate its table elements from everything else.

    Args:
        pdf_path: Path of the PDF file handed to ``partition_pdf``.

    Returns:
        A dict with two keys:
        ``"tables"`` - one dict per element whose category is ``"Table"``,
        holding its HTML rendering (``"html"``), raw text (``"text"``) and
        page number (``"page"``);
        ``"text"`` - the text of all remaining elements joined by newlines.
    """
    parsed = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",  # visual detection
        infer_table_structure=True,
        include_page_breaks=True,
    )

    found_tables = []
    plain_parts = []

    for item in parsed:
        # Anything that is not a table only contributes its text.
        if item.category != "Table":
            plain_parts.append(item.text)
            continue

        found_tables.append({
            "html": item.metadata.text_as_html,
            "text": item.text,
            "page": item.metadata.page_number,
        })

    return {"tables": found_tables, "text": "\n".join(plain_parts)}

Mit Camelot (Native PDFs)

DEVELOPERpython
import camelot

def extract_tables_camelot(pdf_path: str) -> list:
    """
    Extract every table Camelot finds in a PDF.

    Intended for native (non-scanned) PDFs.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One dict per detected table with its position in the result
        (``"table_id"``), ``"page"``, ``"accuracy"``, the table's DataFrame
        (``"dataframe"``) and that DataFrame rendered as ``"html"`` and
        ``"markdown"``.
    """
    def _as_record(position, found) -> dict:
        frame = found.df
        return {
            "table_id": position,
            "page": found.page,
            "accuracy": found.accuracy,
            "dataframe": frame,
            "html": frame.to_html(),
            "markdown": frame.to_markdown(),
        }

    # 'lattice' targets tables drawn with borders; 'stream' is the
    # alternative flavor for tables without borders.
    detected = camelot.read_pdf(pdf_path, pages='all', flavor='lattice')

    return [_as_record(position, found) for position, found in enumerate(detected)]

Visuelle Erkennung (multimodale LLMs)

DEVELOPERpython
import anthropic
import base64

def detect_tables_vision(image_path: str) -> dict:
    """
    Ask a Claude vision model to find and transcribe the tables in an image.

    The image is read from disk, base64-encoded and sent together with a
    prompt requesting one markdown table per detected table.

    Args:
        image_path: Path of the image file. Its media type is guessed from
            the file name; anything other than JPEG, PNG, GIF or WebP
            (or an unknown extension) is declared as ``image/png``.

    Returns:
        A dict with the single key ``"extracted_tables"`` holding the text of
        the first content block of the model's reply.
    """
    import mimetypes

    # Declare the real image format instead of always claiming PNG, so that
    # e.g. a JPEG page scan is not mislabeled in the request.
    supported_media_types = {"image/jpeg", "image/png", "image/gif", "image/webp"}
    guessed_type, _ = mimetypes.guess_type(image_path)
    media_type = guessed_type if guessed_type in supported_media_types else "image/png"

    client = anthropic.Anthropic()

    with open(image_path, "rb") as f:
        image_data = base64.standard_b64encode(f.read()).decode("utf-8")

    response = client.messages.create(
        model="claude-3-5-sonnet-latest",
        max_tokens=4096,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": media_type,
                        "data": image_data
                    }
                },
                {
                    "type": "text",
                    "text": """Extract all tables from this image.
For each table:
1. Output as markdown table
2. Preserve headers
3. Keep all data exactly as shown

Format:
TABLE 1:
| Header1 | Header2 | ... |
|---------|---------|-----|
| data    | data    | ... |

TABLE 2:
..."""
                }
            ]
        }]
    )

    return {
        "extracted_tables": response.content[0].text
    }

Serialisierung von Tabellen

Markdown-Format

DEVELOPERpython
def table_to_markdown(df) -> str:
    """Render *df* as a Markdown table without the index column."""
    rendered = df.to_markdown(index=False)
    return rendered

# Ergebnis:
# | Produkt  | Preis | Lagerbestand |
# |----------|-------|--------------|
# | Widget A | 99€   | Auf Lager    |
# | Widget B | 149€  | Ausverkauft  |

Zeilenbasiertes Format (Besser für RAG)

DEVELOPERpython
def table_to_row_format(df, table_context: str = "") -> list:
    """
    Serialize each row of *df* into one self-contained line of text.

    Every row becomes ``"<column>: <value>; <column>: <value>; ..."`` so it
    can be used as an independent chunk.

    Args:
        df: DataFrame to serialize.
        table_context: Optional label; when non-empty it is prepended to
            every row as ``"<table_context> - "``.

    Returns:
        A list with one string per row, in row order.
    """
    column_names = df.columns.tolist()
    prefix = f"{table_context} - " if table_context else ""

    def _render(record) -> str:
        pairs = (
            f"{name}: {cell}"
            for name, cell in zip(column_names, record.values)
        )
        return prefix + "; ".join(pairs)

    return [_render(record) for _, record in df.iterrows()]

# Ergebnis:
# ["Produktkatalog - Produit: Widget A; Prix: 99€; Stock: En stock",
#  "Produktkatalog - Produit: Widget B; Prix: 149€; Stock: Rupture"]

Q&A-Format (Optimal für Retrieval)

DEVELOPERpython
def table_to_qa_pairs(df, table_title: str) -> list:
    """
    Generate question/answer pairs from a table.

    The first column is treated as the row key. For every other column whose
    cell is neither missing nor blank, one pair is produced asking for that
    column's value for the given key.

    Args:
        df: DataFrame holding the table.
        table_title: Stored as the ``"source"`` of every generated pair.

    Returns:
        A list of dicts with the keys ``"question"``, ``"answer"`` and
        ``"source"``. Empty when the frame has no columns or no rows.
    """
    # Imported locally so the function does not depend on a module-level `pd`.
    import pandas as pd

    headers = df.columns.tolist()
    if not headers:
        return []

    qa_pairs = []

    for _, row in df.iterrows():
        # The first column is used as the row key. Cells are read by position
        # so that duplicate column names cannot return a whole Series.
        key_val = row.iloc[0]

        for position, header in enumerate(headers[1:], start=1):
            value = row.iloc[position]
            if pd.isna(value) or not str(value).strip():
                continue

            # Column labels are not guaranteed to be strings (e.g. integer
            # labels when a table was parsed without a header row).
            label = str(header).lower()
            qa_pairs.append({
                "question": f"Quel est le {label} de {key_val} ?",
                "answer": f"Le {label} de {key_val} est {value}.",
                "source": table_title
            })

    return qa_pairs

# Ergebnis:
# [{"question": "Quel est le prix de Widget A ?",
#   "answer": "Le prix de Widget A est 99€.",
#   "source": "Catalogue Produits"},
#  {"question": "Quel est le stock de Widget A ?",
#   "answer": "Le stock de Widget A est En stock.",
#   "source": "Catalogue Produits"}]

Chunking von Tabellen

Kleine Tabellen (< 20 Zeilen)

Die gesamte Tabelle als einen einzigen Chunk behalten:

DEVELOPERpython
def chunk_small_table(df, metadata: dict) -> dict:
    """
    Turn a small table into a single chunk that carries its context.

    Args:
        df: DataFrame holding the table.
        metadata: Table context; must contain ``"title"``. All of its entries
            are copied into the chunk metadata and take precedence over the
            generated ``"type"``, ``"rows"`` and ``"columns"`` entries.

    Returns:
        A dict with ``"content"`` (bold title followed by the Markdown table)
        and ``"metadata"``.
    """
    table_md = df.to_markdown(index=False)
    body = f"**{metadata['title']}**\n\n{table_md}"

    details = {
        "type": "table",
        "rows": len(df),
        "columns": list(df.columns),
    }
    # Caller-supplied entries are applied last, so they win on key clashes.
    details.update(metadata)

    return {"content": body, "metadata": details}

Mittlere Tabellen (20-100 Zeilen)

Chunking in Gruppen von Zeilen mit Überlappung:

DEVELOPERpython
def chunk_medium_table(
    df,
    metadata: dict,
    rows_per_chunk: int = 10,
    overlap: int = 2
) -> list:
    """
    Split a table into overlapping row groups, repeating the header in each.

    Consecutive windows start ``rows_per_chunk - overlap`` rows apart.
    Chunking stops as soon as a window reaches the last row, so no trailing
    chunk is emitted that only repeats rows of the previous one.

    Args:
        df: DataFrame holding the table.
        metadata: Table context; must contain ``"title"``. All entries are
            copied into each chunk's metadata.
        rows_per_chunk: Maximum number of rows per chunk (at least 1).
        overlap: Number of rows shared by consecutive chunks; must satisfy
            ``0 <= overlap < rows_per_chunk``.

    Returns:
        A list of dicts with ``"content"`` (Markdown) and ``"metadata"``
        (including 1-based ``"start_row"`` / ``"end_row"``). Empty for an
        empty table.

    Raises:
        ValueError: If ``rows_per_chunk`` or ``overlap`` is out of range.
    """
    if rows_per_chunk < 1:
        raise ValueError("rows_per_chunk must be at least 1")
    if not 0 <= overlap < rows_per_chunk:
        # A non-positive step would either crash range() or silently yield
        # no chunks at all; a negative overlap would skip rows.
        raise ValueError("overlap must satisfy 0 <= overlap < rows_per_chunk")

    # Column labels may be non-strings (e.g. integers), which str.join rejects.
    headers = [str(name) for name in df.columns.tolist()]
    header_row = "| " + " | ".join(headers) + " |"
    separator = "| " + " | ".join(["---"] * len(headers)) + " |"

    total_rows = len(df)
    step = rows_per_chunk - overlap
    chunks = []

    for i in range(0, total_rows, step):
        subset = df.iloc[i:i + rows_per_chunk]
        last_row = i + len(subset)

        # to_markdown() emits its own header and separator lines first;
        # drop them and use the ones built above instead.
        rows_md = subset.to_markdown(index=False).split('\n')[2:]

        chunk_md = (
            f"**{metadata['title']}** (lignes {i+1}-{last_row})\n\n"
            f"{header_row}\n{separator}\n" +
            "\n".join(rows_md)
        )

        chunks.append({
            "content": chunk_md,
            "metadata": {
                "type": "table_chunk",
                "start_row": i + 1,
                "end_row": last_row,
                **metadata
            }
        })

        # This window already covers the end of the table; any further
        # window would only contain rows that were just emitted.
        if last_row >= total_rows:
            break

    return chunks

Große Tabellen (> 100 Zeilen)

Konvertierung in zeilenbasiertes Format:

DEVELOPERpython
def chunk_large_table(df, metadata: dict) -> list:
    """
    Turn every row of a large table into its own chunk.

    Args:
        df: DataFrame holding the table.
        metadata: Table context; must contain ``"title"``, which prefixes
            every row's text. All entries are copied into each chunk's
            metadata.

    Returns:
        One dict per row with ``"content"`` (the row serialized by
        ``table_to_row_format``) and ``"metadata"`` (1-based ``"row_index"``
        and the first column's value as ``"primary_key"``).
    """
    # Serialize all rows in a single pass rather than building a one-row
    # DataFrame for each row.
    row_texts = table_to_row_format(df, metadata['title'])

    return [
        {
            "content": row_text,
            "metadata": {
                "type": "table_row",
                "row_index": position + 1,
                "primary_key": str(df.iloc[position, 0]),  # first column as key
                **metadata
            }
        }
        for position, row_text in enumerate(row_texts)
    ]

Kontextanreicherung

Umgebenden Kontext hinzufügen

DEVELOPERpython
def enrich_table_context(
    table_html: str,
    surrounding_text: str,
    llm_client
) -> dict:
    """
    Ask an LLM for a title, a summary and column descriptions of a table.

    Args:
        table_html: HTML rendering of the table, embedded in the prompt.
        surrounding_text: Text near the table; only its first 500 characters
            are sent.
        llm_client: Client exposing ``chat.completions.create(...)`` whose
            result provides ``choices[0].message.content``.

    Returns:
        The JSON value parsed from the model's reply (the prompt requests an
        object with ``"title"``, ``"summary"`` and ``"key_columns"``).

    Raises:
        json.JSONDecodeError: If no JSON can be parsed from the reply.
    """
    import json

    prompt = f"""Analyze this table and its surrounding context.

Surrounding text:
{surrounding_text[:500]}

Table (HTML):
{table_html}

Generate:
1. A descriptive title for the table
2. A one-sentence summary of what the table shows
3. The key columns and what they represent

Output as JSON:
{{"title": "...", "summary": "...", "key_columns": [{{"name": "...", "description": "..."}}]}}"""

    result = llm_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )

    raw_reply = result.choices[0].message.content

    try:
        return json.loads(raw_reply)
    except json.JSONDecodeError:
        # The reply may wrap the JSON in a Markdown code fence or in prose;
        # retry on the outermost {...} span before giving up.
        start = raw_reply.find("{")
        end = raw_reply.rfind("}")
        if start == -1 or end < start:
            raise
        return json.loads(raw_reply[start:end + 1])

Zusammenfassungen erstellen

DEVELOPERpython
def summarize_table(df, llm_client) -> str:
    """
    Ask an LLM for a short textual summary of a table.

    The prompt contains the column list, the row count and the first three
    rows rendered as Markdown.

    Args:
        df: DataFrame holding the table.
        llm_client: Client exposing ``chat.completions.create(...)`` whose
            result provides ``choices[0].message.content``.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    row_count = len(df)
    column_names = list(df.columns)
    preview = df.head(3).to_markdown()

    prompt = f"""Summarize this table in 2-3 sentences.

Columns: {column_names}
Rows: {row_count}
Sample:
{preview}

Summary:"""

    user_message = {"role": "user", "content": prompt}
    completion = llm_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
        max_tokens=100,
        temperature=0,
    )

    reply = completion.choices[0].message.content
    return reply.strip()

Vollständige Pipeline

DEVELOPERpython
class TableProcessor:
    """
    End-to-end pipeline: extract tables from a PDF, enrich and chunk them.

    Args:
        llm_client: Optional LLM client passed to ``enrich_table_context``.
            Without it, tables receive a generic ``"Table <n>"`` title.
    """

    def __init__(self, llm_client=None):
        self.llm = llm_client

    def process_document(self, pdf_path: str) -> dict:
        """
        Run extraction, context enrichment, chunking and Q&A generation.

        Args:
            pdf_path: Path of the PDF to process.

        Returns:
            A dict with ``"text_chunks"`` (chunks of the non-table text),
            ``"table_chunks"`` (one entry per usable table with its
            metadata, chunks, Q&A pairs and row count) and ``"stats"``
            (tables found vs. tables actually processed).
        """
        # 1. Extraction
        raw = extract_with_table_detection(pdf_path)

        processed_tables = []

        for i, table in enumerate(raw["tables"]):
            # 2. Convert to a DataFrame; unparsable or empty tables are skipped
            df = self._html_to_df(table["html"])

            if df is None or df.empty:
                continue

            # 3. Enrich the context (always yields a dict with a "title")
            context = self._build_context(i, table["html"], raw["text"])

            # 4. Chunking strategy depends on the number of rows
            if len(df) <= 20:
                chunks = [chunk_small_table(df, context)]
            elif len(df) <= 100:
                chunks = chunk_medium_table(df, context)
            else:
                chunks = chunk_large_table(df, context)

            # 5. Additionally generate Q&A pairs
            qa_pairs = table_to_qa_pairs(df, context["title"])

            processed_tables.append({
                "table_id": i,
                "metadata": context,
                "chunks": chunks,
                "qa_pairs": qa_pairs,
                "row_count": len(df)
            })

        return {
            "text_chunks": self._chunk_text(raw["text"]),
            "table_chunks": processed_tables,
            "stats": {
                "tables_found": len(raw["tables"]),
                "tables_processed": len(processed_tables)
            }
        }

    def _build_context(self, index: int, table_html: str, document_text: str) -> dict:
        """Return the table context, falling back to a generic title."""
        fallback = {"title": f"Table {index + 1}", "summary": ""}

        if not self.llm:
            return fallback

        import logging

        try:
            # NOTE(review): this passes the first 500 characters of the whole
            # document text, not the text adjacent to this table.
            context = enrich_table_context(
                table_html,
                document_text[:500],
                self.llm
            )
        except (ValueError, TypeError):
            # An unparsable LLM reply must not abort the whole document.
            logging.getLogger(__name__).warning(
                "Could not parse LLM context for table %d", index, exc_info=True
            )
            return fallback

        if not isinstance(context, dict):
            return fallback

        # The chunkers and the Q&A generator all require a title.
        if not context.get("title"):
            context["title"] = fallback["title"]

        return context

    def _html_to_df(self, html: str):
        """Convert an HTML table to a DataFrame, or None if that fails."""
        if not html:
            return None

        import io
        import logging

        import pandas as pd

        try:
            # Wrap the markup in a file-like object: passing literal HTML
            # strings to read_html is deprecated in recent pandas versions.
            dfs = pd.read_html(io.StringIO(html))
        except Exception:
            # Best effort: a table that cannot be parsed is skipped by the
            # caller, but the failure is logged instead of silently dropped.
            logging.getLogger(__name__).warning(
                "Could not parse table HTML", exc_info=True
            )
            return None

        return dfs[0] if dfs else None

    def _chunk_text(self, text: str) -> list:
        """
        Chunk the non-table text.

        Placeholder for the standard text chunking: the whole text is
        returned as a single chunk (or no chunk when it is blank), so the
        declared list return type is honoured.
        """
        if not text or not text.strip():
            return []
        return [text]

# Usage
processor = TableProcessor(llm_client=openai_client)
result = processor.process_document("rapport.pdf")

# Index each table: its chunks first, then its Q&A pairs
for table in result["table_chunks"]:
    for chunk in table["chunks"]:
        vector_db.upsert(chunk)

    # Bonus: index the Q&A pairs as well for better retrieval
    for qa in table["qa_pairs"]:
        vector_db.upsert(dict(
            content=f"Q: {qa['question']}\nA: {qa['answer']}",
            metadata=dict(type="table_qa", source=qa["source"]),
        ))

Benchmarks

Methode	Genauigkeit	Komplexe Tabellen	Latenz
PyPDF2	20%	Scheitert	50ms
Camelot (lattice)	85%	Gut	200ms
Unstructured	80%	Durchschnittlich	500ms
Claude Vision	95%	Ausgezeichnet	2s
GPT-4o Vision	93%	Ausgezeichnet	1.5s

Extraktion und Verarbeitung von Tabellen für RAG