diff --git a/docs/latest/components/preprocessing.mdx b/docs/latest/components/preprocessing.mdx index 4043b33f6..3c0c4acd1 100644 --- a/docs/latest/components/preprocessing.mdx +++ b/docs/latest/components/preprocessing.mdx @@ -50,6 +50,17 @@ Please refer to [the API docs](/reference/file-converters) to see which converte valid_languages=["de","en"]) doc = converter.convert(file_path=file, meta=None) + + # Alternatively, if you have a PDF containing images, Haystack uses Tesseract under the hood to OCR image PDFs. + + + from haystack.file_converter import PDFToTextOCRConverter + + + converter = PDFToTextOCRConverter(remove_numeric_tables=False, + valid_languages=["deu","eng"]) + + doc = converter.convert(file_path=file, meta=None) ), }, @@ -71,7 +82,7 @@ Please refer to [the API docs](/reference/file-converters) to see which converte content: (

- Haystack also has a`convert_files_to_dicts()` utility function that + Haystack also has a `convert_files_to_dicts()` utility function that will convert all txt or pdf files in a given folder into this dictionary format.

@@ -84,6 +95,26 @@ Please refer to [the API docs](/reference/file-converters) to see which converte
), }, + { + title: "Image", + content: ( +
+

+ Haystack supports extraction of text from images using OCR. +

+
+            
+              from haystack.file_converter import ImageToTextConverter
+            
+            
+            converter = ImageToTextConverter(remove_numeric_tables=True,
+            valid_languages=["deu","eng"])
+          
+          doc = converter.convert(file_path=file, meta=None)
+          
+
+ ), + }, ]} />