<?xml version="1.0" encoding="utf-8"?>
<!--
  RSS 2.0 feed document. The "atom" prefix is bound to the Atom namespace
  solely so the channel can carry an atom:link element pointing at its own URL.
-->
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
    <channel>
        <!-- Channel identity: the title, link and description all describe one video watch page. -->
        <title>Defending against AI jailbreaks</title>
        <link>https://tube.grossholtz.net/videos/watch/fd41499a-e6d3-403c-b265-75b58c4a48d2</link>
        <!-- The description is a single line of text: a summary, a "Read more" URL, then a timestamped chapter list. -->
        <description>Anthropic researchers, Mrinank Sharma, Jerry Wei, Ethan Perez and Meg Tong discuss a system based on Constitutional Classifiers that guards models against jailbreaks. Read more: https://www.anthropic.com/news/constitutional-classifiers 0:00 Introduction 0:39 Defining jailbreaks and their importance 3:35 Universal jailbreaks 10:24 The Swiss cheese model for safety 11:25 Explaining Constitutional Classifiers 14:11 Ensuring model helpfulness 17:30 Understanding the constitution and synthetic data 19:00 Flexibility of the constitutional approach 24:15 Origins of the constitutional classifiers approach 32:24 Progress on robustness 38:47 The public demo: Purpose, setup 47:42 Understanding whether the approach is safe in practice 54:05 The public demo: Approaches people tried to bypass classifiers 56:14 Benefits of the classifier approach for Claude users 1:00:18 Memorable moments from the project 1:08:20 Differences in approach between this project and other research 1:11:11 The evolution of AI safety research</description>
        <!-- Build timestamp in the RFC 822 date form that RSS 2.0 uses, expressed in GMT. -->
        <lastBuildDate>Mon, 06 Apr 2026 03:01:45 GMT</lastBuildDate>
        <!-- Pointer to the RSS 2.0 format documentation, followed by the software that produced this file. -->
        <docs>https://validator.w3.org/feed/docs/rss2.html</docs>
        <generator>PeerTube - https://tube.grossholtz.net</generator>
        <!-- Channel image: its title and link repeat the channel's own title and link; the url is a 512x512 icon asset on the same host. -->
        <image>
            <title>Defending against AI jailbreaks</title>
            <url>https://tube.grossholtz.net/client/assets/images/icons/icon-512x512.png</url>
            <link>https://tube.grossholtz.net/videos/watch/fd41499a-e6d3-403c-b265-75b58c4a48d2</link>
        </image>
        <copyright>All rights reserved, unless otherwise specified in the terms specified at https://tube.grossholtz.net/about and potential licenses granted by each content's rightholder.</copyright>
        <!--
          Self-reference: the URL of this feed. Its path is video-comments.xml and its
          videoId query value matches the id in the channel link above.
          NOTE(review): presumably each item would be one comment on that video; confirm against the generator.
        -->
        <atom:link href="https://tube.grossholtz.net/feeds/video-comments.xml?videoId=fd41499a-e6d3-403c-b265-75b58c4a48d2" rel="self" type="application/rss+xml"/>
        <!-- The channel contains no item elements, so the feed currently has no entries. -->
    </channel>
</rss>