Sean
08/31/2023, 6:59 PM
Released BentoML v1.1.4 and OpenLLM v0.2.27, adding response streaming over Server-Sent Events (SSE). See an example service definition for SSE streaming with Llama2; a minimal sketch follows the list below.
• Added response streaming through SSE to the bentoml.io.Text IO Descriptor type.
• Added async generator support to both API Server and Runner to yield incremental text responses.
• Added native SSE streaming support to ☁️ BentoCloud.
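A minimal sketch of such a streaming service definition, assuming BentoML >= 1.1.4 (the service name, the word-by-word echo logic, and the content type are illustrative, not the official Llama2 example):

import bentoml
from bentoml.io import Text

svc = bentoml.Service("sse-demo")  # hypothetical service name

@svc.api(input=Text(), output=Text(content_type="text/event-stream"))
async def generate(prompt: str):
    # Defining the API as an async generator streams each yielded
    # chunk to the client incrementally as an SSE event, instead of
    # buffering one complete response.
    for word in prompt.split():
        yield word + " "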
🦾 OpenLLM added token streaming capabilities to support streaming responses from LLMs.
• Added /v1/generate_stream endpoint for streaming responses from LLMs.
curl -N -X 'POST' 'http://0.0.0.0:3000/v1/generate_stream' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
  "prompt": "### Instruction:\n What is the definition of time (200 words essay)?\n\n### Response:",
  "llm_config": {
    "use_llama2_prompt": false,
    "max_new_tokens": 4096,
    "early_stopping": false,
    "num_beams": 1,
    "num_beam_groups": 1,
    "use_cache": true,
    "temperature": 0.89,
    "top_k": 50,
    "top_p": 0.76,
    "typical_p": 1,
    "epsilon_cutoff": 0,
    "eta_cutoff": 0,
    "diversity_penalty": 0,
    "repetition_penalty": 1,
    "encoder_repetition_penalty": 1,
    "length_penalty": 1,
    "no_repeat_ngram_size": 0,
    "renormalize_logits": false,
    "remove_invalid_values": false,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "encoder_no_repeat_ngram_size": 0,
    "n": 1,
    "best_of": 1,
    "presence_penalty": 0.5,
    "frequency_penalty": 0,
    "use_beam_search": false,
    "ignore_eos": false
  },
  "adapter_name": null
}'
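And a minimal Python client sketch consuming the same stream (this assumes the third-party requests library; stream=True plays the role of curl's -N flag, and only a couple of the llm_config fields above are repeated here for brevity):

import requests

payload = {
    "prompt": "### Instruction:\n What is the definition of time (200 words essay)?\n\n### Response:",
    "llm_config": {"max_new_tokens": 4096, "temperature": 0.89},
    "adapter_name": None,
}

with requests.post(
    "http://0.0.0.0:3000/v1/generate_stream",
    json=payload,
    stream=True,  # do not buffer: read SSE events as they arrive
) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        if line:  # SSE separates events with blank lines; skip them
            print(line, flush=True)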