import asyncio

from kreuzberg import extract_file


async def main() -> None:
    """Extract ``document.pdf`` and print its content and result details."""
    # Extract text from a PDF file
    result = await extract_file("document.pdf")
    print(result.content)

    # The result also contains metadata
    print(f"Mime type: {result.mime_type}")
    print(f"Extraction method: {result.extraction_method}")


asyncio.run(main())
import asyncio

from kreuzberg import ExtractionConfig, PSMMode, TesseractConfig, extract_file


async def main() -> None:
    """Run OCR on ``german_document.jpg`` with a German model and print the text."""
    # Extract text from an image with German language model
    result = await extract_file(
        "german_document.jpg",
        config=ExtractionConfig(
            ocr_config=TesseractConfig(
                language="deu",  # German language model
                psm=PSMMode.SINGLE_BLOCK,  # Treat as a single text block
            ),
        ),
    )
    print(result.content)


asyncio.run(main())
import asyncio
from pathlib import Path

from kreuzberg import batch_extract_file


async def process_documents() -> None:
    """Extract a fixed list of files in one batch call and print a summary of each."""
    file_paths = [
        Path("document1.pdf"),
        Path("document2.docx"),
        Path("image.jpg"),
    ]

    # Process all files concurrently
    results = await batch_extract_file(file_paths)

    # Results are returned in the same order as inputs
    for path, result in zip(file_paths, results):
        print(f"File: {path}")
        print(f"Content: {result.content[:100]}...")  # First 100 chars
        print(f"Mime type: {result.mime_type}")
        print(f"Method: {result.extraction_method}")
        print("---")


asyncio.run(process_documents())