Manages extractors for different MIME types and their configurations.
This class provides functionality to register, unregister, and retrieve extractors based on MIME types. It supports both synchronous and asynchronous operations for managing extractors. A default set of extractors is also maintained alongside user-registered extractors.
Source code in kreuzberg/_registry.py
class ExtractorRegistry:
    """Registry that resolves MIME types to extractor instances.

    Two class-level lists are consulted when resolving a MIME type: the
    user-registered extractors first, then the built-in defaults. Lookups are
    memoized with ``lru_cache``; the cache is cleared whenever the list of
    registered extractors changes.
    """

    # Built-in extractors, consulted after every user-registered extractor.
    _default_extractors: ClassVar[list[type[Extractor]]] = [
        PDFExtractor,
        OfficeDocumentExtractor,
        PresentationExtractor,
        SpreadSheetExtractor,
        HTMLExtractor,
        MarkdownExtractor,
        ImageExtractor,
        BibliographyExtractor,
        EbookExtractor,
        LaTeXExtractor,
        MiscFormatExtractor,
        StructuredTextExtractor,
        TabularDataExtractor,
        XMLBasedExtractor,
    ]
    # Extractors added via ``add_extractor``; they take precedence over the defaults.
    _registered_extractors: ClassVar[list[type[Extractor]]] = []

    @classmethod
    @lru_cache
    def get_extractor(cls, mime_type: str | None, config: ExtractionConfig) -> Extractor | None:
        """Return an extractor instance able to handle ``mime_type``.

        Results are memoized by ``lru_cache``, so both arguments must be
        hashable and repeated calls with equal arguments may return the same
        extractor instance.

        Args:
            mime_type: The MIME type of the content. ``None`` or an empty
                string never matches an extractor.
            config: The extraction configuration handed to the extractor.

        Returns:
            An instance of the first extractor class whose
            ``supports_mimetype`` accepts ``mime_type``, or ``None`` when no
            extractor matches.
        """
        if not mime_type:
            return None

        # User-registered extractors are consulted before the built-in ones.
        for candidate in (*cls._registered_extractors, *cls._default_extractors):
            if candidate.supports_mimetype(mime_type):
                return candidate(mime_type=mime_type, config=config)
        return None

    @classmethod
    def add_extractor(cls, extractor: type[Extractor]) -> None:
        """Register an additional extractor class.

        Note:
            Extractors are tried in the order they are added: first added, first tried.

        Args:
            extractor: The extractor class to register.
        """
        registry = cls._registered_extractors
        registry.append(extractor)
        # Memoized lookups may now be stale, so drop them.
        cls.get_extractor.cache_clear()

    @classmethod
    def remove_extractor(cls, extractor: type[Extractor]) -> None:
        """Unregister a previously added extractor class.

        Removing an extractor that was never registered is a silent no-op.
        Only the first matching entry is removed.

        Args:
            extractor: The extractor class to unregister.
        """
        try:
            cls._registered_extractors.remove(extractor)
        except ValueError:
            # Not registered: nothing changed, so the cache stays valid.
            return
        cls.get_extractor.cache_clear()
|
Add an extractor to the registry.
Note
Extractors are tried in the order they are added: first added, first tried.
PARAMETER | DESCRIPTION |
extractor | The extractor to add. TYPE: type[Extractor] |
Source code in kreuzberg/_registry.py
@classmethod
def add_extractor(cls, extractor: type[Extractor]) -> None:
    """Register an additional extractor class.

    Note:
        Extractors are tried in the order they are added: first added, first tried.

    Args:
        extractor: The extractor class to register.
    """
    registry = cls._registered_extractors
    registry.append(extractor)
    # Memoized lookups may now be stale, so drop them.
    cls.get_extractor.cache_clear()
|
Gets the extractor for the mimetype.
PARAMETER | DESCRIPTION |
mime_type | The mime type of the content. TYPE: str | None |
config | The extraction configuration passed to the selected extractor. TYPE: ExtractionConfig |
RETURNS | DESCRIPTION |
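Extractor | None | An instance of the first extractor that supports the MIME type, or None if the MIME type is empty or no extractor supports it. |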
Source code in kreuzberg/_registry.py
@classmethod
@lru_cache
def get_extractor(cls, mime_type: str | None, config: ExtractionConfig) -> Extractor | None:
    """Return an extractor instance able to handle ``mime_type``.

    Results are memoized by ``lru_cache``, so both arguments must be
    hashable and repeated calls with equal arguments may return the same
    extractor instance.

    Args:
        mime_type: The MIME type of the content. ``None`` or an empty
            string never matches an extractor.
        config: The extraction configuration handed to the extractor.

    Returns:
        An instance of the first extractor class whose
        ``supports_mimetype`` accepts ``mime_type``, or ``None`` when no
        extractor matches.
    """
    if not mime_type:
        return None

    # User-registered extractors are consulted before the built-in ones.
    for candidate in (*cls._registered_extractors, *cls._default_extractors):
        if candidate.supports_mimetype(mime_type):
            return candidate(mime_type=mime_type, config=config)
    return None
|
Remove an extractor from the registry.
PARAMETER | DESCRIPTION |
extractor | The extractor to remove. TYPE: type[Extractor] |
Source code in kreuzberg/_registry.py
@classmethod
def remove_extractor(cls, extractor: type[Extractor]) -> None:
    """Unregister a previously added extractor class.

    Removing an extractor that was never registered is a silent no-op.
    Only the first matching entry is removed.

    Args:
        extractor: The extractor class to unregister.
    """
    try:
        cls._registered_extractors.remove(extractor)
    except ValueError:
        # Not registered: nothing changed, so the cache stays valid.
        return
    cls.get_extractor.cache_clear()
|