miniature-nail-35141 (02/17/2025, 10:46 AM):
gray-monkey-83525 (02/17/2025, 10:46 AM):
miniature-nail-35141 (02/17/2025, 10:48 AM):
gray-monkey-83525 (02/17/2025, 10:48 AM):
miniature-nail-35141 (02/17/2025, 10:49 AM):
gray-monkey-83525 (02/17/2025, 10:49 AM):
```python
import asyncio

from livekit import rtc
from livekit.agents import AutoSubscribe, JobContext, WorkerOptions, cli
from livekit.agents.stt import SpeechEventType
from livekit.plugins import deepgram


async def transcribe_audio(ctx: JobContext):
    # Connect to the room and wait for a participant to join
    await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)
    participant = await ctx.wait_for_participant()

    # Initialize Deepgram STT and open a streaming transcription session
    stt = deepgram.STT(model="nova-2-general", interim_results=True)
    stt_stream = stt.stream()

    async def process_audio(track: rtc.Track):
        # AudioStream yields AudioFrameEvent objects; push the underlying frames to the STT stream
        audio_stream = rtc.AudioStream(track)
        async for ev in audio_stream:
            stt_stream.push_frame(ev.frame)

    @ctx.room.on("track_subscribed")
    def on_track_subscribed(track: rtc.Track, publication: rtc.TrackPublication, participant: rtc.RemoteParticipant):
        if track.kind == rtc.TrackKind.KIND_AUDIO:
            asyncio.create_task(process_audio(track))

    # Print final transcripts as they arrive
    async for event in stt_stream:
        if event.type == SpeechEventType.FINAL_TRANSCRIPT:
            print("Final Transcript:", event.alternatives[0].text)


if __name__ == "__main__":
    # Run as a LiveKit agent worker (a JobContext cannot be constructed directly)
    cli.run_app(WorkerOptions(entrypoint_fnc=transcribe_audio))
```
This code sets up a connection to a LiveKit room, subscribes to audio tracks, and processes them using the Deepgram STT plugin. It prints the final transcriptions to the console.
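Since `interim_results=True` is enabled but the loop above only prints final transcripts, a drop-in variant of that loop could surface interim results too. A minimal sketch, assuming the same `SpeechEventType` enum and event fields used above (interim events also carry their partial text in `alternatives[0].text`):
```python
from livekit.agents.stt import SpeechEventType, SpeechStream


async def print_transcripts(stt_stream: SpeechStream) -> None:
    # Drop-in replacement for the final loop above that also surfaces interim results.
    async for event in stt_stream:
        if event.type == SpeechEventType.INTERIM_TRANSCRIPT:
            # Partial hypotheses that arrive while the user is still speaking
            print("Interim:", event.alternatives[0].text)
        elif event.type == SpeechEventType.FINAL_TRANSCRIPT:
            # Emitted once Deepgram considers the utterance complete
            print("Final Transcript:", event.alternatives[0].text)
```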
Would you like more details or modifications to this code?
Sources:
- https://docs.livekit.io/python/livekit/plugins/deepgram/index.html#livekit.plugins.deepgram.STT
- https://docs.livekit.io/python/livekit/plugins/deepgram/index.html#livekit.plugins.deepgram.SpeechStream
miniature-nail-35141 (02/17/2025, 10:51 AM):
gray-monkey-83525 (02/17/2025, 10:52 AM):
```python
import asyncio

from livekit import rtc
from livekit.agents import AutoSubscribe, JobContext, WorkerOptions, cli
from livekit.agents.multimodal import MultimodalAgent
from livekit.agents.stt import SpeechEventType
from livekit.plugins import deepgram, openai


async def entrypoint(ctx: JobContext):
    # Connect to the room and wait for a participant to join
    await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)
    participant = await ctx.wait_for_participant()

    # Initialize Deepgram STT for transcription
    stt = deepgram.STT(model="nova-2-general", interim_results=True)

    # Initialize the OpenAI Realtime model that drives the conversation
    model = openai.realtime.RealtimeModel(
        instructions="You are a helpful assistant.",
        voice="shimmer",
        temperature=0.8,
        modalities=["audio", "text"],
    )

    # Create and start the MultimodalAgent
    assistant = MultimodalAgent(model=model)
    assistant.start(ctx.room)

    # Open a streaming transcription session
    stt_stream = stt.stream()

    async def process_audio(track: rtc.Track):
        # AudioStream yields AudioFrameEvent objects; push the underlying frames to the STT stream
        audio_stream = rtc.AudioStream(track)
        async for ev in audio_stream:
            stt_stream.push_frame(ev.frame)

    @ctx.room.on("track_subscribed")
    def on_track_subscribed(track: rtc.Track, publication: rtc.TrackPublication, participant: rtc.RemoteParticipant):
        if track.kind == rtc.TrackKind.KIND_AUDIO:
            asyncio.create_task(process_audio(track))

    # Print final transcripts as they arrive
    async for event in stt_stream:
        if event.type == SpeechEventType.FINAL_TRANSCRIPT:
            print("Final Transcript:", event.alternatives[0].text)


if __name__ == "__main__":
    cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint))
```
This code sets up a MultimodalAgent backed by OpenAI's Realtime API and runs Deepgram STT alongside it. The agent connects to a LiveKit room and subscribes to audio tracks; the audio frames are fed to Deepgram for transcription while the MultimodalAgent handles the conversation flow.
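If participants should see these transcripts as live captions instead of (or in addition to) console output, the 0.x agents framework used here includes a transcription forwarder that publishes STT events to the room. A minimal sketch, assuming `STTSegmentsForwarder` from `livekit.agents.transcription` is available in your livekit-agents version and accepts the room, participant, and track keyword arguments shown; check your installed version before relying on it:
```python
from livekit import rtc
from livekit.agents.stt import SpeechStream
from livekit.agents.transcription import STTSegmentsForwarder


async def forward_transcripts(
    room: rtc.Room,
    participant: rtc.RemoteParticipant,
    track: rtc.Track,
    stt_stream: SpeechStream,
) -> None:
    # Publish each STT event to the room so connected clients can render live captions.
    forwarder = STTSegmentsForwarder(room=room, participant=participant, track=track)
    async for event in stt_stream:
        forwarder.update(event)
```
This coroutine would take the place of the plain printing loop, with one forwarder created per subscribed track.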
Would you like more details or modifications to this code?
Sources:
- Deepgram STT
miniature-nail-35141 (02/17/2025, 10:56 AM):
gray-monkey-83525 (02/17/2025, 10:56 AM):
```python
import asyncio

from livekit import rtc
from livekit.agents import AutoSubscribe, JobContext, WorkerOptions, cli
from livekit.agents.multimodal import MultimodalAgent
from livekit.agents.stt import SpeechEventType, StreamAdapter
from livekit.plugins import openai, silero


async def entrypoint(ctx: JobContext):
    # Connect to the room and wait for a participant to join
    await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)
    participant = await ctx.wait_for_participant()

    # OpenAI's Whisper STT is not a streaming API, so wrap it in a StreamAdapter
    # that uses Silero VAD (livekit-plugins-silero) to segment the audio first.
    # Note: OpenAI's hosted API serves "whisper-1"; "whisper-large-v3-turbo"
    # requires an OpenAI-compatible endpoint that hosts that model.
    stt = StreamAdapter(
        stt=openai.STT(model="whisper-large-v3-turbo"),
        vad=silero.VAD.load(),
    )

    # Initialize the OpenAI Realtime model that drives the conversation
    model = openai.realtime.RealtimeModel(
        instructions="You are a helpful assistant.",
        voice="shimmer",
        temperature=0.8,
        modalities=["audio", "text"],
    )

    # Create and start the MultimodalAgent
    assistant = MultimodalAgent(model=model)
    assistant.start(ctx.room)

    # Open a streaming transcription session
    stt_stream = stt.stream()

    async def process_audio(track: rtc.Track):
        # AudioStream yields AudioFrameEvent objects; push the underlying frames to the STT stream
        audio_stream = rtc.AudioStream(track)
        async for ev in audio_stream:
            stt_stream.push_frame(ev.frame)

    @ctx.room.on("track_subscribed")
    def on_track_subscribed(track: rtc.Track, publication: rtc.TrackPublication, participant: rtc.RemoteParticipant):
        if track.kind == rtc.TrackKind.KIND_AUDIO:
            asyncio.create_task(process_audio(track))

    # Print final transcripts as they arrive
    async for event in stt_stream:
        if event.type == SpeechEventType.FINAL_TRANSCRIPT:
            print("Final Transcript:", event.alternatives[0].text)


if __name__ == "__main__":
    cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint))
```
This code sets up a MultimodalAgent backed by OpenAI's Realtime API and runs OpenAI's Whisper STT alongside it for transcription. Because OpenAI's STT does not support streaming, it is wrapped in a StreamAdapter with Silero VAD; the segmented audio is transcribed and printed while the MultimodalAgent handles the conversation flow.
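If the final transcripts should be persisted rather than only printed, the final-transcript branch can append them to a log. A minimal sketch using only the event fields already shown above plus the standard library (the `transcripts.jsonl` filename is an arbitrary choice):
```python
import json
import time

from livekit.agents.stt import SpeechEvent, SpeechEventType


def log_final_transcript(event: SpeechEvent, path: str = "transcripts.jsonl") -> None:
    # Append each final transcript as one JSON line with a wall-clock timestamp.
    if event.type != SpeechEventType.FINAL_TRANSCRIPT:
        return
    record = {"timestamp": time.time(), "text": event.alternatives[0].text}
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(record) + "\n")
```
Calling `log_final_transcript(event)` inside the `async for event in stt_stream:` loop writes one JSON line per utterance.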
Would you like more details or modifications to this code?
Sources:
- OpenAI STT