In [1]:
import os
import asyncio
from tqdm import tqdm
from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline

import re
from langchain.chains import LLMChain
import torch


from typing import Any, Dict, List, Mapping, Optional

import requests

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from langchain.llms.utils import enforce_stop_tokens
from langchain.pydantic_v1 import Extra, root_validator
from langchain.utils import get_from_dict_or_env

VALID_TASKS = ("text2text-generation", "text-generation", "summarization")


class TransformersBatchInference(LLM):

 endpoint_url: str = ""
 """Endpoint URL to use."""

 model_kwargs: Optional[dict] = None
 """Key word arguments to pass to the model."""

 class Config:
 """Configuration for this pydantic object."""

 extra = Extra.forbid

 @property
 def _identifying_params(self) -> Mapping[str, Any]:
 """Get the identifying parameters."""
 _model_kwargs = self.model_kwargs or {}
 return {
 **{"endpoint_url": self.endpoint_url},
 **{"model_kwargs": _model_kwargs},
 }

 @property
 def _llm_type(self) -> str:
 """Return type of llm."""
 return "huggingface_endpoint"

 def _call(
 self,
 prompt: str,
 stop: Optional[List[str]] = None,
 run_manager: Optional[CallbackManagerForLLMRun] = None,
 **kwargs: Any,
 ) -> str:
 """Call out to HuggingFace Hub's inference endpoint.

 Args:
 prompt: The prompt to pass into the model.
 stop: Optional list of stop words to use when generating.

 Returns:
 The string generated by the model.

 Example:
 .. code-block:: python

 response = hf("Tell me a joke.")
 """
 _model_kwargs = self.model_kwargs or {}

 # payload samples
 params = {**_model_kwargs, **kwargs}
 parameter_payload = {"inputs": prompt, "parameters": params}

 # HTTP headers for authorization
 headers = {
 "Content-Type": "application/json",
 }

 try:
 response = requests.post(
 self.endpoint_url, headers=headers, json=parameter_payload
 )
 except requests.exceptions.RequestException as e: # This is the correct syntax
 raise ValueError(f"Error raised by inference endpoint: {e}")
 
 generated_text = response.json()
 if "error" in generated_text:
 raise ValueError(
 f"Error raised by inference API: {generated_text['error']}"
 )
 
 text = generated_text[0]["generated_text"]
 if stop is not None:
 # This is a bit hacky, but I can't figure out a better way to enforce
 # stop tokens when making calls to huggingface_hub.
 text = enforce_stop_tokens(text, stop)
 return text


llm = TransformersBatchInference(endpoint_url="http://localhost:30091/v1/generation")

 from .autonotebook import tqdm as notebook_tqdm


In [2]:
examples = ["What is concious thinking?",
 "How do you know if you are concious?",
 "What is reality?", 
 "When will the world end?",
 "Why is the sky blue?",
 "When is the next world war?",
 "What is a black hole?",
 "What is a quark?",
 "What is a photon?",
 "What is a gluon?"
 "Is there a god?",
 "What is the meaning of life?",
 "What is the meaning of death?",
 "What is the meaning of conciousness?",
 "What is the meaning of reality?",
 "What is the meaning of existence?",
 "What is the meaning of the universe?",
 "What is the meaning of the multiverse?",
 "When does the universe end?",
 "What is the universe expanding into?"]

In [3]:
responses = []

for example in tqdm(examples, total=len(examples)):
 responses.append(await llm.agenerate([example], 
 max_length = 300, 
 top_p = 0.95, 
 top_k = 50, 
 do_sample = True, 
 num_return_sequences = 1, 
 temperature = 0.4, 
 repetition_penalty = 1.2))

100%|██████████| 19/19 [02:09<00:00, 6.83s/it]


In [4]:
print(responses)

[LLMResult(generations=[[Generation(text='What is the purpose of life?\nThe answer, my friend, is blowing in the wind.\nThe answer is blowin\' in the wind."')]], llm_output=None, run=[RunInfo(run_id=UUID('3a7a64b0-dfe4-4d4e-bf07-cbc31c06cfe5'))]), LLMResult(generations=[[Generation(text='What is the purpose of life?\nWhat are we here for?\nWhy do bad things happen to good people?\nHow can I be happy?\nCan you summarize the main themes and questions explored in "The Book of Life" by Deborah Ellis, including its focus on spirituality and philosophy?')]], llm_output=None, run=[RunInfo(run_id=UUID('a8ef2b65-9729-4b59-b77f-c141ef5a76a8'))]), LLMResult(generations=[[Generation(text="What is the purpose of life?\n- How can we find happiness and fulfillment in our lives?\n- Why do bad things happen to good people, and what does this say about God's goodness?\n\nThese questions are not new or unique to us. They have been asked by philosophers, theologians, and ordinary people throughout history

In [7]:
calls = []

for _ in tqdm(range(1), total=1):
 for example in examples:
 calls.append(llm.agenerate([example], 
 max_length = 300, 
 top_p = 0.95, 
 top_k = 50, 
 do_sample = True, 
 num_return_sequences = 1, 
 temperature = 0.4, 
 repetition_penalty = 1.2))

 reponses_batch = await asyncio.gather(*calls)

100%|██████████| 1/1 [00:30<00:00, 30.45s/it]


In [8]:
reponses_batch

[LLMResult(generations=[[Generation(text='What is concious thinking?\nHow can we develop our ability to think consciously and make better decisions in life?')]], llm_output=None, run=[RunInfo(run_id=UUID('298cb2d6-739e-4b2a-b1ba-a985bbe049d7'))]),
 LLMResult(generations=[[Generation(text="How do you know if you are concious?\nI have no idea. I just know that it's a good thing to be, and I want to help others become more conscious too.")]], llm_output=None, run=[RunInfo(run_id=UUID('21d40274-fe19-461a-afaf-a4e74ecacf34'))]),
 LLMResult(generations=[[Generation(text="When will the world end?\n\nJASON: (sarcastically) Oh, I don't know. Maybe when we run out of beer or something.\n\nThey all laugh and clink their glasses together in a toast to their friendship.\n\nCUT TO:\n\nINT. LIVING ROOM - DAY\n\nThe guys are lounging on the couch, watching TV. Suddenly, they hear a loud crash from upstairs. They exchange worried looks before heading up to investigate.\n\nCUT TO:\n\nINT. BEDROOM - DAY\