Skip to content

Extraction Functions

Kreuzberg provides both async and sync functions for text extraction. All functions accept an optional ExtractionConfig parameter for configuring the extraction process.

Asynchronous Functions

These functions return awaitable coroutines that must be awaited or run in an asyncio event loop.

extract_file

Extract text from a file path:

kreuzberg.extract_file(file_path: PathLike[str] | str, mime_type: str | None = None, config: ExtractionConfig = DEFAULT_CONFIG) -> ExtractionResult async

Extract the textual content from a given file.

PARAMETER DESCRIPTION
file_path

The path to the file.

TYPE: PathLike[str] | str

mime_type

The mime type of the content.

TYPE: str | None DEFAULT: None

config

Extraction options object, defaults to the default object.

TYPE: ExtractionConfig DEFAULT: DEFAULT_CONFIG

RETURNS DESCRIPTION
ExtractionResult

The extracted content and the mime type of the content.

Source code in kreuzberg/extraction.py
async def extract_file(
    file_path: PathLike[str] | str, mime_type: str | None = None, config: ExtractionConfig = DEFAULT_CONFIG
) -> ExtractionResult:
    """Asynchronously extract the text of the file located at ``file_path``.

    Args:
        file_path: Location of the file to process.
        mime_type: Mime type of the file. It is handed to ``validate_mime_type`` together with the path,
            and the value that call returns is the one used for the rest of the extraction.
        config: Extraction options; ``DEFAULT_CONFIG`` is used when omitted.

    Returns:
        The extracted content and the mime type of the content, after validation and post-processing.
    """
    resolved_mime_type = validate_mime_type(file_path=file_path, mime_type=mime_type)
    extractor = ExtractorRegistry.get_extractor(mime_type=resolved_mime_type, config=config)

    if extractor:
        raw_result = await extractor.extract_path_async(Path(file_path))
    else:
        # The registry returned no extractor for this mime type: read the file's bytes and decode them directly.
        file_bytes = await anyio.Path(file_path).read_bytes()
        raw_result = ExtractionResult(
            content=safe_decode(file_bytes), chunks=[], mime_type=resolved_mime_type, metadata={}
        )

    return await _validate_and_post_process_async(result=raw_result, config=config)

extract_bytes

Extract text from raw bytes:

kreuzberg.extract_bytes(content: bytes, mime_type: str, config: ExtractionConfig = DEFAULT_CONFIG) -> ExtractionResult async

Extract the textual content from a given byte string representing a file's contents.

PARAMETER DESCRIPTION
content

The content to extract.

TYPE: bytes

mime_type

The mime type of the content.

TYPE: str

config

Extraction options object, defaults to the default object.

TYPE: ExtractionConfig DEFAULT: DEFAULT_CONFIG

RETURNS DESCRIPTION
ExtractionResult

The extracted content and the mime type of the content.

Source code in kreuzberg/extraction.py
async def extract_bytes(content: bytes, mime_type: str, config: ExtractionConfig = DEFAULT_CONFIG) -> ExtractionResult:
    """Asynchronously extract text from a byte string holding a file's contents.

    Args:
        content: Raw bytes of the file to process.
        mime_type: Mime type describing ``content``; it is passed through ``validate_mime_type`` before use.
        config: Extraction options; ``DEFAULT_CONFIG`` is used when omitted.

    Returns:
        The extracted content and the mime type of the content, after validation and post-processing.
    """
    checked_mime_type = validate_mime_type(mime_type=mime_type)
    extractor = ExtractorRegistry.get_extractor(mime_type=checked_mime_type, config=config)

    if not extractor:
        # The registry returned no extractor for this mime type, so the bytes are decoded directly.
        extracted = ExtractionResult(content=safe_decode(content), chunks=[], mime_type=checked_mime_type, metadata={})
    else:
        extracted = await extractor.extract_bytes_async(content)

    return await _validate_and_post_process_async(result=extracted, config=config)

batch_extract_file

Process multiple files concurrently:

kreuzberg.batch_extract_file(file_paths: Sequence[PathLike[str] | str], config: ExtractionConfig = DEFAULT_CONFIG) -> list[ExtractionResult] async

Extract text from multiple files concurrently.

PARAMETER DESCRIPTION
file_paths

A sequence of paths to files to extract text from.

TYPE: Sequence[PathLike[str] | str]

config

Extraction options object, defaults to the default object.

TYPE: ExtractionConfig DEFAULT: DEFAULT_CONFIG

RETURNS DESCRIPTION
list[ExtractionResult]

A list of extraction results in the same order as the input paths.

Source code in kreuzberg/extraction.py
async def batch_extract_file(
    file_paths: Sequence[PathLike[str] | str], config: ExtractionConfig = DEFAULT_CONFIG
) -> list[ExtractionResult]:
    """Run ``extract_file`` for every path inside a single anyio task group.

    Args:
        file_paths: The paths of the files to extract text from.
        config: Extraction options shared by every extraction; ``DEFAULT_CONFIG`` is used when omitted.

    Returns:
        One extraction result per input path, positioned at the same index as its path.
    """
    # Tasks may finish in any order, so each one writes into the slot reserved for its input index.
    ordered = cast("list[ExtractionResult]", [None] * len(file_paths))

    async def _run_one(position: int, target: PathLike[str] | str) -> None:
        ordered[position] = await extract_file(target, None, config)

    async with anyio.create_task_group() as task_group:
        for position, target in enumerate(file_paths):
            task_group.start_soon(_run_one, position, target)

    # Leaving the task group block waits for all started tasks, so every slot is filled here.
    return ordered

batch_extract_bytes

Process multiple byte contents concurrently:

kreuzberg.batch_extract_bytes(contents: Sequence[tuple[bytes, str]], config: ExtractionConfig = DEFAULT_CONFIG) -> list[ExtractionResult] async

Extract text from multiple byte contents concurrently.

PARAMETER DESCRIPTION
contents

A sequence of tuples containing (content, mime_type) pairs.

TYPE: Sequence[tuple[bytes, str]]

config

Extraction options object, defaults to the default object.

TYPE: ExtractionConfig DEFAULT: DEFAULT_CONFIG

RETURNS DESCRIPTION
list[ExtractionResult]

A list of extraction results in the same order as the input contents.

Source code in kreuzberg/extraction.py
async def batch_extract_bytes(
    contents: Sequence[tuple[bytes, str]], config: ExtractionConfig = DEFAULT_CONFIG
) -> list[ExtractionResult]:
    """Run ``extract_bytes`` for every ``(content, mime_type)`` pair inside a single anyio task group.

    Args:
        contents: The ``(content, mime_type)`` pairs to extract text from.
        config: Extraction options shared by every extraction; ``DEFAULT_CONFIG`` is used when omitted.

    Returns:
        One extraction result per input pair, positioned at the same index as its pair.
    """
    # Tasks may finish in any order, so each one writes into the slot reserved for its input index.
    ordered = cast("list[ExtractionResult]", [None] * len(contents))

    async def _run_one(position: int, payload: bytes, payload_mime_type: str) -> None:
        ordered[position] = await extract_bytes(payload, payload_mime_type, config)

    async with anyio.create_task_group() as task_group:
        for position, (payload, payload_mime_type) in enumerate(contents):
            task_group.start_soon(_run_one, position, payload, payload_mime_type)

    # Leaving the task group block waits for all started tasks, so every slot is filled here.
    return ordered

Synchronous Functions

These functions block until extraction is complete and are suitable for non-async contexts.

extract_file_sync

Synchronous version of extract_file:

kreuzberg.extract_file_sync(file_path: Path | str, mime_type: str | None = None, config: ExtractionConfig = DEFAULT_CONFIG) -> ExtractionResult

Synchronous version of extract_file.

PARAMETER DESCRIPTION
file_path

The path to the file.

TYPE: Path | str

mime_type

The mime type of the content.

TYPE: str | None DEFAULT: None

config

Extraction options object, defaults to the default object.

TYPE: ExtractionConfig DEFAULT: DEFAULT_CONFIG

RETURNS DESCRIPTION
ExtractionResult

The extracted content and the mime type of the content.

Source code in kreuzberg/extraction.py
def extract_file_sync(
    file_path: Path | str, mime_type: str | None = None, config: ExtractionConfig = DEFAULT_CONFIG
) -> ExtractionResult:
    """Synchronous version of extract_file.

    Args:
        file_path: The path to the file.
        mime_type: The mime type of the content. It is handed to ``validate_mime_type`` together with the
            path, and the value that call returns is the one used for the rest of the extraction.
        config: Extraction options object, defaults to the default object.

    Returns:
        The extracted content and the mime type of the content.
    """
    mime_type = validate_mime_type(file_path=file_path, mime_type=mime_type)
    if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
        result = extractor.extract_path_sync(Path(file_path))
    else:
        # Read raw bytes and decode them with safe_decode, exactly as the async extract_file and
        # extract_bytes_sync fallbacks do. Path.read_text() would instead decode with the locale's
        # default encoding in strict mode, so the sync and async paths could disagree on the same
        # file, and undecodable bytes would raise UnicodeDecodeError.
        result = ExtractionResult(
            content=safe_decode(Path(file_path).read_bytes()),
            chunks=[],
            mime_type=mime_type,
            metadata={},
        )
    return _validate_and_post_process_sync(result=result, config=config)

extract_bytes_sync

Synchronous version of extract_bytes:

kreuzberg.extract_bytes_sync(content: bytes, mime_type: str, config: ExtractionConfig = DEFAULT_CONFIG) -> ExtractionResult

Synchronous version of extract_bytes.

PARAMETER DESCRIPTION
content

The content to extract.

TYPE: bytes

mime_type

The mime type of the content.

TYPE: str

config

Extraction options object, defaults to the default object.

TYPE: ExtractionConfig DEFAULT: DEFAULT_CONFIG

RETURNS DESCRIPTION
ExtractionResult

The extracted content and the mime type of the content.

Source code in kreuzberg/extraction.py
def extract_bytes_sync(content: bytes, mime_type: str, config: ExtractionConfig = DEFAULT_CONFIG) -> ExtractionResult:
    """Blocking counterpart of extract_bytes.

    Args:
        content: Raw bytes of the file to process.
        mime_type: Mime type describing ``content``; it is passed through ``validate_mime_type`` before use.
        config: Extraction options; ``DEFAULT_CONFIG`` is used when omitted.

    Returns:
        The extracted content and the mime type of the content, after validation and post-processing.
    """
    checked_mime_type = validate_mime_type(mime_type=mime_type)
    extractor = ExtractorRegistry.get_extractor(mime_type=checked_mime_type, config=config)

    if not extractor:
        # The registry returned no extractor for this mime type, so the bytes are decoded directly.
        extracted = ExtractionResult(content=safe_decode(content), chunks=[], mime_type=checked_mime_type, metadata={})
    else:
        extracted = extractor.extract_bytes_sync(content)

    return _validate_and_post_process_sync(result=extracted, config=config)

batch_extract_file_sync

Synchronous version of batch_extract_file:

kreuzberg.batch_extract_file_sync(file_paths: Sequence[PathLike[str] | str], config: ExtractionConfig = DEFAULT_CONFIG) -> list[ExtractionResult]

Synchronous version of batch_extract_file.

PARAMETER DESCRIPTION
file_paths

A sequence of paths to files to extract text from.

TYPE: Sequence[PathLike[str] | str]

config

Extraction options object, defaults to the default object.

TYPE: ExtractionConfig DEFAULT: DEFAULT_CONFIG

RETURNS DESCRIPTION
list[ExtractionResult]

A list of extraction results in the same order as the input paths.

Source code in kreuzberg/extraction.py
def batch_extract_file_sync(
    file_paths: Sequence[PathLike[str] | str], config: ExtractionConfig = DEFAULT_CONFIG
) -> list[ExtractionResult]:
    """Blocking counterpart of batch_extract_file; the files are processed one after another.

    Args:
        file_paths: The paths of the files to extract text from.
        config: Extraction options shared by every extraction; ``DEFAULT_CONFIG`` is used when omitted.

    Returns:
        One extraction result per input path, in input order.
    """
    # Each entry is wrapped in Path before being handed to extract_file_sync; no mime type is supplied.
    return [
        extract_file_sync(file_path=Path(path), mime_type=None, config=config)
        for path in file_paths
    ]

batch_extract_bytes_sync

Synchronous version of batch_extract_bytes:

kreuzberg.batch_extract_bytes_sync(contents: Sequence[tuple[bytes, str]], config: ExtractionConfig = DEFAULT_CONFIG) -> list[ExtractionResult]

Synchronous version of batch_extract_bytes.

PARAMETER DESCRIPTION
contents

A sequence of tuples containing (content, mime_type) pairs.

TYPE: Sequence[tuple[bytes, str]]

config

Extraction options object, defaults to the default object.

TYPE: ExtractionConfig DEFAULT: DEFAULT_CONFIG

RETURNS DESCRIPTION
list[ExtractionResult]

A list of extraction results in the same order as the input contents.

Source code in kreuzberg/extraction.py
def batch_extract_bytes_sync(
    contents: Sequence[tuple[bytes, str]], config: ExtractionConfig = DEFAULT_CONFIG
) -> list[ExtractionResult]:
    """Blocking counterpart of batch_extract_bytes; the items are processed one after another.

    Args:
        contents: The ``(content, mime_type)`` pairs to extract text from.
        config: Extraction options shared by every extraction; ``DEFAULT_CONFIG`` is used when omitted.

    Returns:
        One extraction result per input pair, in input order.
    """
    return [
        extract_bytes_sync(content=payload, mime_type=payload_mime_type, config=config)
        for payload, payload_mime_type in contents
    ]