Merge pull request #175 from deepset-ai/transformers-document-classifier

Add transformers document classifier
deepset-ai · Oct 1, 2021 · 2759c9e · 2759c9e · vercel · Oct 1, 2021
2 parents 3380d12 + 5e90cc4
commit 2759c9e
Show file tree

Hide file tree

Showing 4 changed files with 60 additions and 48 deletions.
diff --git a/docs/latest/components/classifier.mdx b/docs/latest/components/classifier.mdx
diff --git a/docs/latest/components/document_classifier.mdx b/docs/latest/components/document_classifier.mdx
@@ -0,0 +1,53 @@
+# Document Classifier
+
+The TransformersDocumentClassifier Node is a transformer-based classification model used to create predictions that can be attached to retrieved documents as metadata.
+For example, by using a sentiment model, you can label each document as being either positive or negative in sentiment.
+Through a tight integration with the HuggingFace model hub, you can easily load any classification model by simply supplying the model name.
+
+![image](/img/classifier.png)
+
+<div className="max-w-xl bg-yellow-light-theme border-l-8 border-yellow-dark-theme px-6 pt-6 pb-4 my-4 rounded-md dark:bg-yellow-900">
+
+Note that the Document Classifier is different from the Query Classifier.
+While the Query Classifier categorizes incoming queries in order to route them to different parts of the pipeline,
+the Document Classifier is used to create classification labels that can be attached to retrieved documents as metadata.
+
+</div>
+
+## Usage
+
+Initialize it as follows:
+
+``` python
+from haystack.document_classifier import TransformersDocumentClassifier
+
+doc_classifier_model = 'bhadresh-savani/distilbert-base-uncased-emotion'
+doc_classifier = TransformersDocumentClassifier(model_name_or_path=doc_classifier_model)
+```
+
+Alternatively, if you can't find a classification model that has been pre-trained for your exact classification task, you can use zero-shot classification with a custom list of labels and a Natural Language Inference (NLI) model as follows:
+
+``` python
+doc_classifier_model = 'cross-encoder/nli-distilroberta-base'
+doc_classifier = TransformersDocumentClassifier(
+ model_name_or_path=doc_classifier_model,
+ task="zero-shot-classification",
+ labels=["negative", "positive"])
+```
+
+
+It is slotted into a pipeline as follows:
+
+``` python
+pipeline = Pipeline()
+pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
+pipeline.add_node(component=doc_classifier, name='DocClassifier', inputs=['Retriever'])
+```
+
+It can also be run in isolation:
+
+``` python
+documents = doc_classifier.predict(
+ documents = [doc1, doc2, doc3, ...]
+)
+```
diff --git a/docs/latest/menu.json b/docs/latest/menu.json
@@ -30,7 +30,8 @@
  {"slug": "knowledge-graph", "title": "Knowledge Graph"},
  {"slug": "ranker", "title": "Ranker"},
  {"slug": "query-classifier", "title": "Query Classifier"},
- {"slug": "question-generator", "title": "Question Generator"} ]
+ {"slug": "question-generator", "title": "Question Generator"},
+ {"slug": "document-classifier", "title": "Document Classifier"}]
  },
  {
  "subMenuTitle": "Guides",
@@ -122,7 +123,8 @@
  {"slug": "knowledge-graph", "title": "Knowledge Graph"},
  {"slug": "graph-retriever", "title": "Graph Retriever"},
  {"slug": "question-generator", "title": "Question Generator"},
- {"slug": "ranker", "title": "Ranker"}
+ {"slug": "ranker", "title": "Ranker"},
+ {"slug": "document-classifier", "title": "Document Classifier"}
  ]
  }
 ]
diff --git a/lib/constants.ts b/lib/constants.ts
@@ -81,9 +81,9 @@ export const referenceFiles: Meta = {
  title: "Question Generator",
  },
  {
- slug: "classifier",
- filename: "classifier.md",
- title: "Classifier",
+ slug: "document-classifier",
+ filename: "document_classifier.md",
+ title: "Document Classifier",
  },
  {
  slug: "ranker",