from kreuzberg import (
    extract_file,
    ExtractionConfig,
    ValidationError,
    ExtractionResult,
)


# Validation hook: called with the extraction result, raises to reject it
def validate_content_length(result: ExtractionResult) -> None:
    """Reject extraction results whose content is shorter than 10 characters.

    Args:
        result: The extraction result to check; only ``result.content`` is read.

    Raises:
        ValidationError: If ``result.content`` has fewer than 10 characters.
    """
    content_is_long_enough = len(result.content) >= 10
    if content_is_long_enough:
        return
    raise ValidationError("Extracted content is too short (less than 10 characters)")


# Register the validation hook on the config and run an extraction with it
async def extract_with_validation():
    """Extract ``document.pdf`` with ``validate_content_length`` as a validator.

    Returns:
        Whatever ``extract_file`` returns for the configured extraction.
    """
    validated_config = ExtractionConfig(validators=[validate_content_length])
    return await extract_file("document.pdf", config=validated_config)
import re

from kreuzberg import extract_file, ExtractionConfig, ExtractionResult


# Define a post-processing hook
def clean_whitespace(result: ExtractionResult) -> ExtractionResult:
    """Clean up excessive whitespace in the extracted text.

    Runs of non-newline whitespace (spaces, tabs, ...) are collapsed to a
    single space, and runs of consecutive newlines are collapsed to a single
    newline, so line structure is preserved.

    Args:
        result: The extraction result whose ``content`` should be cleaned.

    Returns:
        A new ``ExtractionResult`` built from the cleaned content and the
        original ``mime_type`` and ``metadata``.
    """
    # Collapse whitespace *other than* newlines. A plain r"\s+" also matches
    # "\n", which would turn every line break into a space and leave the
    # newline pass below with nothing to do.
    cleaned_content = re.sub(r"[^\S\n]+", " ", result.content)
    # Replace multiple newlines with a single newline
    cleaned_content = re.sub(r"\n+", "\n", cleaned_content)
    # Create a new result with the cleaned content
    return ExtractionResult(
        content=cleaned_content,
        mime_type=result.mime_type,
        metadata=result.metadata,
    )


# Use the post-processing hook
async def extract_with_post_processing():
    """Extract ``document.pdf`` with ``clean_whitespace`` as a post-processing hook.

    Returns:
        Whatever ``extract_file`` returns for the configured extraction.
    """
    config = ExtractionConfig(post_processing_hooks=[clean_whitespace])
    result = await extract_file("document.pdf", config=config)
    return result
import re

from kreuzberg import extract_file, ExtractionConfig, ExtractionResult, ValidationError


# Define validation hooks
def validate_content_length(result: ExtractionResult) -> None:
    """Reject extraction results whose content is shorter than 10 characters.

    Raises:
        ValidationError: If ``result.content`` has fewer than 10 characters.
    """
    if len(result.content) < 10:
        raise ValidationError("Extracted content is too short")


def validate_has_text(result: ExtractionResult) -> None:
    """Reject extraction results whose content is empty or whitespace-only.

    Raises:
        ValidationError: If ``result.content`` is empty after stripping
            surrounding whitespace.
    """
    if not result.content.strip():
        raise ValidationError("Extracted content is empty or contains only whitespace")


# Define post-processing hooks
def clean_whitespace(result: ExtractionResult) -> ExtractionResult:
    """Clean up excessive whitespace in the extracted text.

    Runs of non-newline whitespace are collapsed to a single space, and runs
    of consecutive newlines are collapsed to a single newline.

    Returns:
        A new ``ExtractionResult`` built from the cleaned content and the
        original ``mime_type`` and ``metadata``.
    """
    # Exclude "\n" from the first pass: r"\s+" would also swallow newlines,
    # flattening the text to one line and making the second pass a no-op.
    cleaned_content = re.sub(r"[^\S\n]+", " ", result.content)
    cleaned_content = re.sub(r"\n+", "\n", cleaned_content)
    return ExtractionResult(
        content=cleaned_content,
        mime_type=result.mime_type,
        metadata=result.metadata,
    )


def normalize_text(result: ExtractionResult) -> ExtractionResult:
    """Normalize text by converting to lowercase and removing special characters.

    "Special characters" here means every character that is neither a word
    character (``\\w``, which includes underscores) nor whitespace.

    Returns:
        A new ``ExtractionResult`` built from the normalized content and the
        original ``mime_type`` and ``metadata``.
    """
    # Convert to lowercase
    normalized = result.content.lower()
    # Remove special characters
    normalized = re.sub(r"[^\w\s]", "", normalized)
    return ExtractionResult(
        content=normalized,
        mime_type=result.mime_type,
        metadata=result.metadata,
    )


# Use multiple hooks
async def extract_with_multiple_hooks():
    """Extract ``document.pdf`` with two validators and two post-processing hooks.

    Returns:
        Whatever ``extract_file`` returns for the configured extraction.
    """
    config = ExtractionConfig(
        validators=[validate_content_length, validate_has_text],
        post_processing_hooks=[clean_whitespace, normalize_text],
    )
    result = await extract_file("document.pdf", config=config)
    return result
import logging

from kreuzberg import extract_file, ExtractionConfig, ExtractionResult


def detect_language(result: ExtractionResult) -> ExtractionResult:
    """Detect the language of the extracted text and add it to metadata.

    Detection is best-effort: if the text is 50 characters or shorter, or if
    anything goes wrong (including ``langdetect`` not being installed), the
    original result is returned unchanged.

    Args:
        result: The extraction result to annotate.

    Returns:
        A new ``ExtractionResult`` whose metadata has an added
        ``"detected_language"`` entry, or ``result`` itself when detection was
        skipped or failed.
    """
    try:
        # You need to install langdetect: pip install langdetect
        from langdetect import detect

        # Only detect if we have enough text
        if len(result.content) > 50:
            language = detect(result.content)

            # Copy the metadata so the incoming result is not mutated
            updated_metadata = dict(result.metadata)
            updated_metadata["detected_language"] = language

            return ExtractionResult(
                content=result.content,
                mime_type=result.mime_type,
                metadata=updated_metadata,
            )
    except Exception:
        # Deliberate best-effort fallback: keep going with the original result,
        # but leave a trace so a missing dependency or a detection failure is
        # diagnosable instead of silently disappearing.
        logging.getLogger(__name__).debug(
            "Language detection failed; returning the original result",
            exc_info=True,
        )

    return result


async def extract_with_language_detection():
    """Extract ``document.pdf`` and print the detected language, if any.

    Returns:
        Whatever ``extract_file`` returns for the configured extraction.
    """
    config = ExtractionConfig(post_processing_hooks=[detect_language])
    result = await extract_file("document.pdf", config=config)

    if "detected_language" in result.metadata:
        print(f"Detected language: {result.metadata['detected_language']}")

    return result