diff --git a/docs/latest/components/preprocessing.mdx b/docs/latest/components/preprocessing.mdx
index 4043b33f6..3c0c4acd1 100644
--- a/docs/latest/components/preprocessing.mdx
+++ b/docs/latest/components/preprocessing.mdx
@@ -50,6 +50,17 @@ Please refer to [the API docs](/reference/file-converters) to see which converte
valid_languages=["de","en"])
doc = converter.convert(file_path=file, meta=None)
+
+ # Alternatively, if you have a PDF containing images, Haystack uses Tesseract under the hood to OCR image PDFs.
+
+
+ from haystack.file_converter import PDFToTextOCRConverter
+
+
+ converter = PDFToTextOCRConverter(remove_numeric_tables=False,
+ valid_languages=["deu","eng"])
+
+ doc = converter.convert(file_path=file, meta=None)
),
},
@@ -71,7 +82,7 @@ Please refer to [the API docs](/reference/file-converters) to see which converte
content: (
- Haystack also has a`convert_files_to_dicts()` utility function that
+ Haystack also has a `convert_files_to_dicts()` utility function that
will convert all txt or pdf files in a given folder into this dictionary format.
@@ -84,6 +95,26 @@ Please refer to [the API docs](/reference/file-converters) to see which converte
+ Haystack supports extraction of text from images using OCR.
+
++++ from haystack.file_converter import ImageToTextConverter +
++ converter = ImageToTextConverter(remove_numeric_tables=True, + valid_languages=["deu","eng"]) +
+doc = converter.convert(file_path=file, meta=None)
+