3d:["$","div",null,{"className":"mt-2 flex flex-wrap items-center gap-3","children":[["$","$L42",null,{"episodeData":{"uuid":"ce9bf9d9-d407-4b1e-bb14-99571201b518","slug":"agent-bench-evaluating-llms-as-agents-ce9bf9d9","name":"Agent Bench: Evaluating LLMs as Agents","subtitle":null,"description":"

Large Language Models (LLMs) are rapidly evolving, but how do we assess their ability to act as agents in complex, real-world scenarios? Join Jenny as we explore Agent Bench, a new benchmark designed to evaluate LLMs in diverse environments, from operating systems to digital card games.

We'll delve into the key findings, including the strengths and weaknesses of different LLMs and the challenges of developing truly intelligent agents.

","imageUrl":"https://media.rss.com/ai-safety-breakthrough/ep_cover_20241127_121158_10c7ce1b5c4ee07a9e48355a800e0fad.png","audioUrl":"https://content.rss.com/episodes/302490/1773317/ai-safety-breakthrough/2024_11_27_00_20_05_8f162033-f12c-4844-9c0b-36f3d3bdd051.mp3","duration":798,"datePublished":1732666817,"transcript":null,"summary":null,"transcribeInProgress":null,"summaryInProgress":null,"podcastSeries":{"uuid":"0ee57855-efed-4e3e-ac5e-7cb3e2e625a4","name":"AI Safety Breakthrough","imageUrl":"https://media.rss.com/ai-safety-breakthrough/20241123_071129_ce16d0ad02796df5471abe5c1c97d328.png","authorName":"AI SafeGuard","itunesId":1781398468,"datePublished":1732482199,"description":"

The future of AI is in our hands. Join AI SafeGuard on \"AI Safety Breakthrough\" as we explore the frontiers of AI safety research and discuss how we can ensure a future where AI remains beneficial for everyone. We delve into the latest breakthroughs, uncover potential risks, and empower listeners to become informed participants in the conversation about AI's role in society. Subscribe now and become part of the solution!

Intro about the author

J, graduated from Carnegie Mellon University, School of Computer Science, 10+ years in Cybersecurity, Cyber Threat Intelligence, Risk, Compliance, privacy and AI Safety.

","slug":"ai-safety-breakthrough","language":"en"}}}],["$","$L43",null,{"seriesUuid":"0ee57855-efed-4e3e-ac5e-7cb3e2e625a4","size":"md"}],["$","$L44",null,{"title":"Agent Bench: Evaluating LLMs as Agents","url":"https://pod.wave.co/podcast/ai-safety-breakthrough/agent-bench-evaluating-llms-as-agents-ce9bf9d9"}],["$","$L45",null,{"episodeData":"$3d:props:children:0:props:episodeData"}]]}]