Hi Team,
I’ve been working with LiveKit Vision and LiveKit Virtual Avatars (Tavus). While testing with the Gemini Live model, we noticed that the lip sync is inaccurate. Currently the Gemini Live model handles both STT and TTS, but our requirement is slightly different.
We would like GPT to handle STT and TTS, and to use the Gemini Live model solely as the LLM that processes video frames and generates responses. Is this configuration possible?
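From the plugin reference it looks like google.beta.realtime.RealtimeModel takes a modalities option (defaulting to audio output). If that is correct, forcing text-only output might be what we need, so that Gemini Live acts purely as the LLM and the OpenAI TTS produces all speech. A rough, unverified sketch of what we have in mind (modalities=["TEXT"] is our assumption from the docs, not something we have confirmed):

# Unverified sketch: assumes the Google plugin accepts modalities=["TEXT"].
# If it does, Gemini Live should return text only, leaving all audio (and
# hence the avatar's lip sync) to the separate TTS stage.
llm = google.beta.realtime.RealtimeModel(
    modalities=["TEXT"],  # assumption: suppress Gemini's own audio output
    temperature=0.8,
)

Our full current code follows.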
Code:
from dotenv import load_dotenv
import logging
import os
from livekit.agents import (
    Agent,
    AgentSession,
    AutoSubscribe,
    RoomInputOptions,
    WorkerOptions,
    cli,
    metrics,
)
from livekit.plugins import google, noise_cancellation, openai, silero, tavus
load_dotenv()

logger = logging.getLogger("vision-agent")
logger.setLevel(logging.INFO)
# Two Azure OpenAI resources: the first serves STT, the second serves TTS.
# (azure_deployment is read here but not used below.)
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_deployment = os.getenv("DEPLOYMENT_NAME")
api_key = os.getenv("AZURE_OPENAI_API_KEY")
api_version = os.getenv("OPENAI_API_VERSION")
azure_endpoint_2 = os.getenv("AZURE_OPENAI_ENDPOINT_2")
azure_api_key_2 = os.getenv("AZURE_OPENAI_API_KEY_2")
class VisionAgent(Agent):
    def __init__(self):
        super().__init__(
            instructions=(
                "You are a helpful voice assistant with live video input "
                "from your user. If the user speaks in English, respond in "
                "English. If the user speaks in Tamil or Hindi, respond in "
                "the same language."
            )
        )
async def entrypoint(ctx):
    await ctx.connect(auto_subscribe=AutoSubscribe.SUBSCRIBE_ALL)

    session = AgentSession(
        vad=silero.VAD.load(),
        # Azure OpenAI handles speech-to-text.
        stt=openai.STT.with_azure(
            model="gpt-4o-transcribe",
            azure_endpoint=azure_endpoint,
            api_key=api_key,
            api_version=api_version,
            detect_language=True,
        ),
        # Gemini Live is meant to act only as the LLM for video + text.
        llm=google.beta.realtime.RealtimeModel(
            voice="Kore",
            temperature=0.8,
        ),
        # Azure OpenAI handles text-to-speech.
        tts=openai.TTS.with_azure(
            model="gpt-4o-mini-tts",
            voice="alloy",
            azure_endpoint=azure_endpoint_2,
            api_key=azure_api_key_2,
        ),
    )
    usage_collector = metrics.UsageCollector()
    session.on("metrics_collected", lambda ev: usage_collector.collect(ev.metrics))

    assistant = VisionAgent()

    avatar = tavus.AvatarSession(
        replica_id="rc2146c13e81",
        persona_id="pdced222244b",
    )
    await avatar.start(session, room=ctx.room)
    logger.info("Avatar session started successfully")
    await session.start(
        agent=assistant,
        room=ctx.room,
        room_input_options=RoomInputOptions(
            noise_cancellation=noise_cancellation.BVC(),
            video_enabled=True,
            close_on_disconnect=False,
        ),
    )
    async def log_usage():
        summary = usage_collector.get_summary()
        logger.info(f"Usage summary: {summary}")

    ctx.add_shutdown_callback(log_usage)
if __name__ == "__main__":
    cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint))
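For reference, we run the worker locally with python agent.py dev (agent.py is just our local filename), with LIVEKIT_URL, LIVEKIT_API_KEY, and LIVEKIT_API_SECRET plus the Azure variables from the os.getenv calls above set in our .env file.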