Document new converters

ju-gu · Sep 20, 2021 · 98f0012 · 98f0012
1 parent 680c616
commit 98f0012
Showing 1 changed file with 32 additions and 1 deletion.
diff --git a/docs/latest/components/preprocessing.mdx b/docs/latest/components/preprocessing.mdx
@@ -50,6 +50,17 @@ Please refer to [the API docs](/reference/file-converters) to see which converte
  valid_languages=["de","en"])
  </code>
  <code>doc = converter.convert(file_path=file, meta=None)</code>
+ <code>
+ # Alternatively, if you have a PDF containing images, Haystack uses Tesseract under the hood to OCR image PDFs.
+ </code>
+ <code>
+ from haystack.file_converter import PDFToTextOCRConverter
+ </code>
+ <code>
+ converter = PDFToTextOCRConverter(remove_numeric_tables=False,
+ valid_languages=["deu","eng"])
+ </code>
+ <code>doc = converter.convert(file_path=file, meta=None)</code>
  </pre>
  ),
  },
@@ -71,7 +82,7 @@ Please refer to [the API docs](/reference/file-converters) to see which converte
  content: (
  <div>
  <p>
- Haystack also has a`convert_files_to_dicts()` utility function that
+ Haystack also has a `convert_files_to_dicts()` utility function that
  will convert all txt or pdf files in a given folder into this
  dictionary format.
  </p>
@@ -84,6 +95,26 @@ Please refer to [the API docs](/reference/file-converters) to see which converte
  </div>
  ),
  },
+ {
+ title: "Image",
+ content: (
+ <div>
+ <p>
+ Haystack supports extraction of text from images using OCR.
+ </p>
+ <pre>
+ <code>
+ from haystack.file_converter import ImageToTextConverter
+ </code>
+ <code>
+ converter = ImageToTextConverter(remove_numeric_tables=True,
+ valid_languages=["deu","eng"])
+ </code>
+ <code>doc = converter.convert(file_path=file, meta=None)</code>
+ </pre>
+ </div>
+ ),
+ },
  ]}
 />