This is my current setup.
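For context, it assumes roughly these imports (elided above; the module paths follow the LiveKit Agents 1.x plugin layout as I understand it, and the STTFallbackAdapter / LLMFallbackAdapter / TTSFallbackAdapter names are local aliases of each modality's FallbackAdapter, so adjust per your livekit-agents version):

# Assumed imports, reconstructed from the LiveKit Agents layout; adjust to your version
from livekit.agents import Agent, AgentSession, RoomInputOptions
from livekit.agents.llm import FallbackAdapter as LLMFallbackAdapter
from livekit.agents.stt import FallbackAdapter as STTFallbackAdapter, StreamAdapter
from livekit.agents.tts import FallbackAdapter as TTSFallbackAdapter, StreamAdapter as TTSStreamAdapter
from livekit.plugins import anthropic, assemblyai, cartesia, deepgram, google, noise_cancellation, openai
from livekit.plugins.turn_detector.english import EnglishModel
from livekit.plugins.turn_detector.multilingual import MultilingualModel

The setup itself: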
if is_english_language:
    selected_turn_detector = EnglishModel()
    turn_detector_name = "EnglishModel"
    logger.info("🎯 Turn detector: EnglishModel selected for English")
elif turn_detector_supported:
    selected_turn_detector = MultilingualModel()
    turn_detector_name = "MultilingualModel"
    logger.info(f"🎯 Turn detector: MultilingualModel selected - {language_name_safe} is supported")
else:
    selected_turn_detector = "vad"
    turn_detector_name = "VAD"
    logger.warning(f"⚠️ Turn detector: Falling back to VAD-only detection for {language_name_safe} (language not supported by MultilingualModel)")
# For the turn detector to work properly with MultilingualModel, we should use "multi"
# to let Deepgram auto-detect the language and pass it to the turn detector.
if not is_english_language and turn_detector_supported:
    logger.info("🔧 Using Deepgram 'multi' mode for language detection with turn detector support")
    stt_language_code_for_deepgram = "multi"  # let Deepgram auto-detect and pass the language on
else:
    stt_language_code_for_deepgram = stt_language_code or "multi"
    if not stt_language_code:
        logger.error(f"❌ Deepgram language code not found for '{language_name_safe}'. Using 'multi' as fallback.")
        stt_language_code = "multi"
        stt_language_code_for_deepgram = "multi"

openai_language_code = "en" if is_english_language else None
openai_detect_language = openai_language_code is None
logger.info("🔧 Language Configuration:")
logger.info(f"  • Input language: '{languages.language_name}' -> normalized: '{language_name_safe}'")
logger.info(f"  • Deepgram STT code: '{stt_language_code}' -> using: '{stt_language_code_for_deepgram}'")
logger.info(f"  • OpenAI STT: lang={openai_language_code}, auto_detect={openai_detect_language}")
logger.info(f"  • Turn detector supported: {turn_detector_supported}")

logger.info("🤖 Starting agent session with models:")
logger.info(f"  • STT: Deepgram nova-3-general (lang: {stt_language_code_for_deepgram}), OpenAI gpt-4o-transcribe, AssemblyAI")
logger.info("  • LLM: Anthropic claude-3-5-sonnet-20241022, Google gemini-2.5-flash")
logger.info(f"  • TTS: {voices.provider} {voices.model_name}, OpenAI gpt-4o-mini-tts, Cartesia sonic-2")

# Log the final STT configuration before creating the agent
logger.info("🚀 Creating Agent with final configuration:")
logger.info(f"  • Deepgram language: '{stt_language_code_for_deepgram}' (auto-detect mode for turn detector)")
logger.info(f"  • Turn detector: {turn_detector_name}")
logger.info(f"  • Language supported by turn detector: {turn_detector_supported}")
agent_session = AgentSession(
    # llm=openai.realtime.RealtimeModel(model="gpt-4o-realtime-preview"),
    # llm=google.beta.realtime.RealtimeModel(
    #     model="gemini-2.0-flash-live-001", instructions=system_content, api_key=envconfig.GEMINI_API_KEY
    # ),
    # turn_detection=MultilingualModel(),
    # llm=[
    #     openai.LLM(model="gpt-4.1-mini"),
    #     anthropic.LLM(model="claude-3-5-sonnet-20241022"),
    # ],
    vad=ctx.proc.userdata["vad"],
    turn_detection=selected_turn_detector,
    min_consecutive_speech_delay=0.8,
    stt=STTFallbackAdapter(
        vad=ctx.proc.userdata["vad"],
        stt=[
            deepgram.STT(
                model="nova-3-general",
                mip_opt_out=True,
                filler_words=True,
                language=stt_language_code_for_deepgram,
            ),
            StreamAdapter(
                stt=openai.STT(
                    model="gpt-4o-transcribe",
                    language=openai_language_code,
                    detect_language=openai_detect_language,
                ),
                vad=ctx.proc.userdata["vad"],
            ),
            assemblyai.STT(),
        ],
        attempt_timeout=20.0,
        max_retry_per_stt=2,
        retry_interval=5.0,
    ),
    llm=LLMFallbackAdapter(
        llm=[
            anthropic.LLM(model="claude-3-5-sonnet-20241022"),
            google.LLM(model="gemini-2.5-flash", api_key=envconfig.GEMINI_API_KEY),
        ],
        attempt_timeout=45.0,
        max_retry_per_llm=2,
        retry_interval=5.0,
    ),
    tts=TTSFallbackAdapter(
        tts=[get_voice(voices, languages)]
        + [
            TTSStreamAdapter(
                tts=openai.TTS(
                    voice="alloy", model="gpt-4o-mini-tts", instructions=openai_instructions(languages)
                )
            ),
            cartesia.TTS(voice="97f4b8fb-f2fe-444b-bb9a-c109783a857a", model="sonic-2"),
        ],
        max_retry_per_tts=4,
    ),
)

await agent_session.start(
    room=ctx.room,
    # agent=Agent(
    #     instructions=system_content,
    #     # tts=google.TTS(credentials_info=envconfig.GOOGLE_CREDENTIALS, voice_name="Puck"),
    # ),
    agent=Agent(
        instructions=system_content,
    ),
    room_input_options=RoomInputOptions(noise_cancellation=noise_cancellation.BVC(), close_on_disconnect=False),
)
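The snippet also leans on a few values computed earlier: is_english_language, turn_detector_supported, language_name_safe, and stt_language_code. A minimal sketch of how they might be derived, assuming a small hand-maintained language table (the table contents and constant names below are placeholders, not the real mapping):

# Hypothetical sketch of the precomputed language flags; table entries are examples only
DEEPGRAM_LANGUAGE_CODES = {"english": "en", "spanish": "es", "hindi": "hi"}
TURN_DETECTOR_LANGUAGES = {"english", "spanish", "hindi"}  # languages MultilingualModel is trusted with

language_name_safe = (languages.language_name or "english").strip().lower()
is_english_language = language_name_safe == "english"
turn_detector_supported = language_name_safe in TURN_DETECTOR_LANGUAGES
stt_language_code = DEEPGRAM_LANGUAGE_CODES.get(language_name_safe)  # None triggers the "multi" fallback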