Skip to content

Extractor Registry

The ExtractorRegistry manages document extractors and allows custom extractor registration.

kreuzberg.ExtractorRegistry

Manages extractors for different MIME types and their configurations.

This class provides functionality to register, unregister, and retrieve extractors based on MIME types. It supports both synchronous and asynchronous operations for managing extractors. A default set of extractors is also maintained alongside user-registered extractors.

Source code in kreuzberg/_registry.py
class ExtractorRegistry:
    """Registry of extractor classes, resolved by MIME type.

    Two class-level lists are kept: a fixed list of built-in extractor
    classes and a list of user-registered extractor classes. On lookup the
    user-registered classes are consulted first, then the built-in ones; the
    first class that reports support for the MIME type is instantiated.
    """

    # Built-in extractor classes, consulted after any user-registered ones.
    # The order of this list is the order in which they are tried.
    _default_extractors: ClassVar[list[type[Extractor]]] = [
        PDFExtractor,
        OfficeDocumentExtractor,
        PresentationExtractor,
        SpreadSheetExtractor,
        HTMLExtractor,
        MarkdownExtractor,
        ImageExtractor,
        BibliographyExtractor,
        EbookExtractor,
        LaTeXExtractor,
        MiscFormatExtractor,
        StructuredTextExtractor,
        TabularDataExtractor,
        XMLBasedExtractor,
    ]
    # Extractor classes added through ``add_extractor``; tried before the defaults.
    _registered_extractors: ClassVar[list[type[Extractor]]] = []

    @classmethod
    @lru_cache
    def get_extractor(cls, mime_type: str | None, config: ExtractionConfig) -> Extractor | None:
        """Return an extractor instance able to handle ``mime_type``.

        The lookup is memoised with ``lru_cache``: arguments must be hashable,
        and a repeated call with equal arguments yields the cached result.
        ``add_extractor`` and ``remove_extractor`` clear this cache.

        Args:
            mime_type: The MIME type of the content. ``None`` or an empty
                string never matches any extractor.
            config: Extraction options handed to the extractor's constructor.

        Returns:
            An instance of the first extractor class that supports
            ``mime_type``, or ``None`` when there is no match.
        """
        if not mime_type:
            return None

        # User-registered classes take precedence over the built-in ones.
        for candidate in (*cls._registered_extractors, *cls._default_extractors):
            if candidate.supports_mimetype(mime_type):
                return candidate(mime_type=mime_type, config=config)

        return None

    @classmethod
    def add_extractor(cls, extractor: type[Extractor]) -> None:
        """Register an additional extractor class.

        Note:
            Registered extractors are tried in insertion order (first added,
            first tried) and ahead of the built-in defaults.

        Args:
            extractor: The extractor class to register.

        Returns:
            None
        """
        registered = cls._registered_extractors
        registered.append(extractor)
        # Cached lookups may no longer reflect the registry contents.
        cls.get_extractor.cache_clear()

    @classmethod
    def remove_extractor(cls, extractor: type[Extractor]) -> None:
        """Unregister a previously registered extractor class.

        Removing a class that is not registered is a silent no-op.

        Args:
            extractor: The extractor class to unregister.

        Returns:
            None
        """
        try:
            cls._registered_extractors.remove(extractor)
        except ValueError:
            # Not registered: nothing changed, so the cache stays valid.
            return
        cls.get_extractor.cache_clear()

Functions

add_extractor(extractor: type[Extractor]) -> None classmethod

Add an extractor to the registry.

Note

Extractors are tried in the order they are added: first added, first tried. User-registered extractors are tried before the built-in default extractors.

PARAMETER DESCRIPTION
extractor

The extractor to add.

TYPE: type[Extractor]

RETURNS DESCRIPTION
None

None

Source code in kreuzberg/_registry.py
@classmethod
def add_extractor(cls, extractor: type[Extractor]) -> None:
    """Register an additional extractor class.

    Note:
        Registered extractors are tried in insertion order: first added,
        first tried.

    Args:
        extractor: The extractor class to register.

    Returns:
        None
    """
    registered = cls._registered_extractors
    registered.append(extractor)
    # Cached lookups may no longer reflect the registry contents.
    cls.get_extractor.cache_clear()

get_extractor(mime_type: str | None, config: ExtractionConfig) -> Extractor | None cached classmethod

Gets the extractor for the mimetype.

PARAMETER DESCRIPTION
mime_type

The mime type of the content.

TYPE: str | None

config

Extraction options object passed to the selected extractor. This parameter is required; it has no default value.

TYPE: ExtractionConfig

RETURNS DESCRIPTION
Extractor | None

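An instance of the first extractor that supports the MIME type, or None if the MIME type is empty or no extractor supports it.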

Source code in kreuzberg/_registry.py
@classmethod
@lru_cache
def get_extractor(cls, mime_type: str | None, config: ExtractionConfig) -> Extractor | None:
    """Return an extractor instance able to handle ``mime_type``.

    The lookup is memoised with ``lru_cache``: arguments must be hashable,
    and a repeated call with equal arguments yields the cached result.

    Args:
        mime_type: The MIME type of the content. ``None`` or an empty string
            never matches any extractor.
        config: Extraction options handed to the extractor's constructor.

    Returns:
        An instance of the first extractor class that supports ``mime_type``,
        or ``None`` when there is no match.
    """
    if not mime_type:
        return None

    # User-registered classes take precedence over the built-in ones.
    for candidate in (*cls._registered_extractors, *cls._default_extractors):
        if candidate.supports_mimetype(mime_type):
            return candidate(mime_type=mime_type, config=config)

    return None

remove_extractor(extractor: type[Extractor]) -> None classmethod

Remove an extractor from the registry.

PARAMETER DESCRIPTION
extractor

The extractor to remove.

TYPE: type[Extractor]

RETURNS DESCRIPTION
None

None

Source code in kreuzberg/_registry.py
@classmethod
def remove_extractor(cls, extractor: type[Extractor]) -> None:
    """Unregister a previously registered extractor class.

    Removing a class that is not registered is a silent no-op.

    Args:
        extractor: The extractor class to unregister.

    Returns:
        None
    """
    try:
        cls._registered_extractors.remove(extractor)
    except ValueError:
        # Not registered: nothing changed, so the cache stays valid.
        return
    cls.get_extractor.cache_clear()