<!DOCTYPE html>
<html>
<head>
<script async src="https://www.googletagmanager.com/gtag/js?id=G-C1CRWDNJ1J"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'G-C1CRWDNJ1J');
</script>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>HF. 12 papers. September 27.</title>
<link rel="icon" href="favicon.svg" sizes="any" type="image/svg+xml">
<link href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;700&display=swap" rel="stylesheet">
<link href="https://fonts.googleapis.com/css2?family=Roboto+Slab:wght@100..900&family=Tiny5&display=swap" rel="stylesheet">
<style>
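/* Theme palette: light-mode defaults; the .dark-theme rules further down override selected colors. */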
:root {
--primary-color: cornflowerblue;
--primary-color-dark: #fffd87cf;
--secondary-color: #fff;
--background-color: #eee;
--text-color: #333333;
--header-color: cornflowerblue;
--body-color: #eee;
--menu-color: #002370;
}
.background-digit {
position: absolute;
font-family: 'Tiny5';
bottom: -20px;
right: -10px;
font-size: 8em;
font-weight: 400;
color: #0989ea22;
z-index: 2;
line-height: 1;
}
.dark-theme .background-digit {
color: #e9e78f3d;
}
body {
font-family: 'Roboto Slab', sans-serif;
line-height: 1.6;
color: var(--text-color);
margin: 0;
padding: 0;
min-height: 100vh;
display: flex;
flex-direction: column;
}
.container {
max-width: 1500px;
margin: 0 auto;
padding: 0 20px;
flex: 1 0 auto;
}
.a-clean {
color: var(--secondary-color);
text-decoration: none;
}
.a-clean:hover {
color: #fff;
}
header {
padding: 3.6em 0 2.4em 0;
text-align: center;
}
footer {
background-color: var(--primary-color);
color: white;
text-align: center;
margin-top: 2em;
flex-shrink: 0;
padding: 20px;
}
h1 {
font-size: 2.4em;
margin: 0;
font-weight: 700;
}
.article-title-cont {
margin: -21px -21px 0px -21px;
padding: 10px 20px;
background: cornflowerblue;
display: table;
min-height: 5.9em;
}
.dark-theme .article-title-cont {
background: #444444;
}
.article-title {
color: white;
}
.article-title h2 {
margin: 0px;
padding: 0px;
font-weight: 400;
text-align:center;
}
h2 {
/* color: var(--primary-color); */
font-size: 1.2em;
margin-top: 0;
margin-bottom: 0.5em;
}
header p {
font-size: 1.2em;
margin-top: 0.5em;
font-weight: 300;
}
main {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
gap: 1.5em;
padding: 10px 0 20px 0;
}
body.dark-theme>header {
background-color: #333333;
color: white;
}
body.dark-theme>div>main>article>div.article-content>p.meta {
color: #fff;
}
body.light-theme>div>main>article>div.article-content>p.meta {
color: #555;
}
body.dark-theme>div>main>article>div.article-content>p.pub-date {
color: #ccc;
}
body.light-theme>div>main>article>div.article-content>p.pub-date {
color: #555;
}
body.dark-theme>div>main>article>div.article-content>div.tags {
color: #ccc;
}
body.light-theme>div>main>article>div.article-content>div.tags {
color: #fff;
}
body.light-theme>header {
background-color: var(--header-color);
color: white;
}
article {
border-radius: 5px;
border: 1px solid #ddd;
overflow: hidden;
transition: background-color 0.2s ease;
display: flex;
flex-direction: column;
position: relative;
}
.article-content {
padding: 1.3em;
flex-grow: 1;
display: flex;
flex-direction: column;
position: relative;
z-index: 1;
cursor: pointer;
}
body.dark-theme>div>main>article {
background-color: #444;
border: none;
}
body.light-theme>div>main>article {
background-color: #fff;
}
body.dark-theme>div>main>article:hover {
background-color: #414141;
}
body.light-theme>div>main>article:hover {
background-color: #fafafa;
}
.meta {
font-size: 0.9em;
margin-bottom: 0em;
font-weight: 500;
margin: 20px 0 0px 0;
padding-bottom: 20px;
border-bottom: 1px solid #ddd;
}
.pub-date {
font-size: 0.8em;
margin-bottom: 0.8em;
font-weight: 400;
text-align: right;
font-family: Roboto;
}
.tags {
font-size: 0.9em;
margin-bottom: 0;
position: absolute;
bottom: 0px;
font-weight: 300;
font-family: 'Roboto Slab';
background: #555;
left: 0;
width: 100%;
padding: 10px 20px;
}
.abstract {
position: relative;
max-height: 170px;
overflow: hidden;
transition: max-height 0.3s ease;
cursor: pointer;
}
.abstract.expanded {
max-height: 1000px;
}
.abstract-toggle {
position: absolute;
bottom: 4px;
right: 0;
cursor: pointer;
color: var(--primary-color);
float: right;
font-weight: 400;
}
.explanation {
background-color: #e8f5e9;
border-left: 4px solid var(--secondary-color);
padding: 1em;
margin-top: 1.5em;
}
.links {
margin-top: 1.5em;
margin-bottom: 20px;
}
.affiliations {
margin-bottom: 50px;
padding:10px;
font-size: 0.9em;
text-align: center;
}
a {
color: var(--primary-color);
text-decoration: none;
font-weight: 500;
transition: color 0.3s ease;
}
.dark-theme a {
color: var(--primary-color-dark);
}
a:hover {
color: #e73838;
}
.light-theme {
background-color: var(--body-color);
color: #333333;
}
.dark-theme {
background-color: #333333;
color: #ffffff;
}
.theme-switch {
position: absolute;
top: 20px;
right: 20px;
display: flex;
align-items: center;
}
.switch {
position: relative;
display: inline-block;
width: 50px;
height: 30px;
}
.switch input {
opacity: 0;
width: 0;
height: 0;
}
.slider {
position: absolute;
cursor: pointer;
top: 0;
left: 0;
right: 0;
bottom: 0;
background-color: #ccc;
transition: .4s;
border-radius: 30px;
}
.slider:before {
position: absolute;
content: "";
height: 24px;
width: 24px;
left: 3px;
bottom: 3px;
background-color: white;
transition: .4s;
border-radius: 50%;
}
input:checked + .slider {
background-color: var(--primary-color);
}
input:checked + .slider:before {
transform: translateX(20px);
}
.switch-label {
margin-right: 10px;
}
.sub-header-container {
display: flex;
justify-content: space-between;
align-items: center;
flex-wrap: wrap;
gap: 15px;
margin-top: 7px;
}
.sub-header-container-2 {
display: flex;
justify-content: left;
align-items: center;
flex-wrap: wrap;
gap: 15px;
margin: 0 auto;
}
.update-info-container {
margin-top: 15px;
margin-bottom: 0px;
text-align: left;
flex: 1;
}
.sort-container {
margin-top: 15px;
margin-bottom: 0px;
text-align: right;
flex: 2;
}
.category-toggle-container {
display: inline-block;
margin-top: 15px;
margin-bottom: 10px;
cursor: pointer;
}
.category-option-container {
margin-top: 15px;
margin-bottom: 10px;
display: none;
margin-left: auto;
}
.category-option-container.expanded {
display: block;
}
.sort-dropdown {
padding: 5px 10px;
font-size: 16px;
border-radius: 5px;
border: 1px solid #ccc;
background-color: white;
color: var(--text-color);
font-family: 'Roboto Slab', sans-serif;
}
.sort-label {
margin-right: 10px;
font-size: 1.0em !important;
}
.dark-theme .sort-dropdown {
background-color: #444;
color: white;
border-color: var(--text-color);
}
.title-sign {
display: inline-block;
transition: all 0.5s ease;
}
.rotate {
transform: rotate(45deg) translateY(-6px);
transform-origin: center;
}
.title-text {
display: inline;
padding-left: 10px;
}
.category-filters {
margin-top: 20px;
margin-bottom: 20px;
text-align: center;
display: none;
}
.category-filters.expanded {
display: block;
margin-top: 10px;
}
.category-button {
display: inline-block;
margin: 5px;
padding: 5px 10px;
border-radius: 15px;
background-color: #f0f0f0;
color: #333;
cursor: pointer;
transition: background-color 0.3s ease;
}
.category-button.active {
background-color: var(--primary-color);
color: white;
}
.category-button.inactive:not(.active) {
color: #ccc;
}
.dark-theme .category-button {
background-color: #555;
color: #fff;
}
.dark-theme .category-button.active {
background-color: var(--primary-color);
}
.dark-theme .category-button.inactive:not(.active) {
color: #888;
}
.clear-categories {
display: inline-block;
margin: 5px;
padding: 5px 10px;
border-radius: 15px;
background-color: #f0f0f0;
color: #333;
cursor: pointer;
transition: background-color 0.3s ease;
}
.clear-categories:hover {
background-color: #bbb;
}
.svg-container {
display: inline-block;
position: relative;
overflow: hidden;
}
.svg-container span {
position: relative;
z-index: 1;
}
.svg-container svg {
position: absolute;
bottom: 0;
left: 0;
z-index: 0;
}
.nav-menu {
background-color: var(--menu-color);
padding: 2px 0 2px 0;
display: inline-block;
position: relative;
overflow: hidden;
width: 100%;
}
.nav-container {
max-width: 1500px;
margin: 0 auto;
padding: 0 20px;
display: flex;
justify-content: left;
gap: 3em;
}
.nav-container span a {
color: white;
}
.nav-item {
color: white;
padding: 3px 0px;
cursor: pointer;
font-weight: 400;
}
.nav-item:hover {
background-color: rgba(255, 255, 255, 0.1);
border-color: rgba(255, 255, 255, 0.3);
}
.language-flags {
display: flex;
gap: 7px;
padding: 5px 0px;
margin-left: auto;
}
.flag-svg {
width: 22px;
height: 22px;
cursor: pointer;
opacity: 0.4;
transition: opacity 0.3s ease;
border-radius: 2px;
}
.flag-svg.active {
opacity: 1;
}
.flag-svg:hover {
opacity: 0.8;
}
.dark-theme .nav-menu {
background-color: #333;
}
.dark-theme .nav-item {
color: white;
}
.dark-theme .nav-item:hover {
background-color: rgba(255, 255, 255, 0.05);
}
.pointer { cursor: pointer; }
.article-pdf-title-img {
max-width: 100%;
max-height: 400px;
display: inline-block;
margin-top: 10px;
margin-bottom: 10px;
border-radius: 5px;
}
.article-pdf-title-img-cont {
text-align: center;
}
.dark-theme .article-pdf-title-img {
opacity: 0.8;
filter: grayscale(1);
}
@media (max-width: 600px) {
.nav-container {
flex-direction: row;
gap: 1.5em;
}
.nav-item {
padding: 3px 0px;
}
}
@media (max-width: 768px) {
.category-filters {
display: none;
}
.category-toggle {
display: inline-block;
width: 100%;
text-align: left;
}
.category-filters.expanded {
display: block;
margin-top: 10px;
}
}
@media (max-width: 600px) {
.sub-header-container {
flex-direction: column;
align-items: flex-start;
}
.sort-container {
width: 100%;
display: flex;
justify-content: left;
margin: 0 auto;
}
.sort-dropdown {
margin-left: auto;
}
.sort-label {
margin-top: 5px;
float: left;
}
.sub-header-container-2 {
flex-direction: row;
align-items: flex-start;
}
.update-info-container {
text-align: left;
width: 100%;
margin-bottom: 0px;
}
.category-toggle-container {
margin-top: 15px;
text-align: left;
margin-bottom: 10px;
}
.category-option-container {
margin-top: 15px;
text-align: center;
margin-bottom: 10px;
}
main {
grid-template-columns: 1fr;
gap: 0em;
padding: 10px 0 20px 0;
margin: 0 -20px;
}
footer {
margin-top: -20px;
}
article {
border-radius: 0px;
}
}
</style>
<script>
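// Expands or collapses the abstract block for the article with the given id,
// toggling the trailing "..." indicator accordingly.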
function toggleAbstract(id) {
var abstract = document.getElementById('abstract-' + id);
var toggle = document.getElementById('toggle-' + id);
if (abstract.classList.contains('expanded')) {
abstract.classList.remove('expanded');
toggle.textContent = '...';
} else {
abstract.classList.add('expanded');
toggle.textContent = '';
}
}
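// Formats how long ago a "YYYY-MM-DD HH:MM" UTC timestamp was, localized for ru/en/zh
// (e.g. "5 minutes ago", "только что", "3天前").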
function getTimeDiff(dateString, lang='ru') {
const timeUnits = {
ru: {
minute: ["минуту", "минуты", "минут"],
hour: ["час", "часа", "часов"],
day: ["день", "дня", "дней"],
justNow: "только что",
ago: "назад"
},
en: {
minute: ["minute", "minutes", "minutes"],
hour: ["hour", "hours", "hours"],
day: ["day", "days", "days"],
justNow: "just now",
ago: "ago"
},
zh: {
minute: ["分钟", "分钟", "分钟"],
hour: ["小时", "小时", "小时"],
day: ["天", "天", "天"],
justNow: "刚刚",
ago: "前"
}
};
function getPlural(number, words, lang) {
if (lang === 'ru') {
if (number % 10 === 1 && number % 100 !== 11) {
return words[0];
} else if (number % 10 >= 2 && number % 10 <= 4 && (number % 100 < 10 || number % 100 >= 20)) {
return words[1];
} else {
return words[2];
}
} else if (lang === 'en') {
return number === 1 ? words[0] : words[1];
} else {
// Chinese doesn't need plural forms
return words[0];
}
}
function formatTimeDiff(number, unit, lang) {
const unitWord = getPlural(number, timeUnits[lang][unit], lang);
if (lang === 'zh') {
return `${number}${unitWord}${timeUnits[lang].ago}`;
} else {
return `${number} ${unitWord} ${timeUnits[lang].ago}`;
}
}
if (!['ru', 'en', 'zh'].includes(lang)) {
throw new Error('Unsupported language. Supported languages are: ru, en, zh');
}
const pastDate = new Date(dateString.replace(" ", "T") + ":00Z");
const currentDate = new Date();
const diffInSeconds = Math.floor((currentDate - pastDate) / 1000);
const minutes = Math.floor(diffInSeconds / 60);
const hours = Math.floor(diffInSeconds / 3600);
const days = Math.floor(diffInSeconds / 86400);
if (minutes === 0) {
return timeUnits[lang].justNow;
} else if (minutes < 60) {
return formatTimeDiff(minutes, 'minute', lang);
} else if (hours < 24) {
return formatTimeDiff(hours, 'hour', lang);
} else {
return formatTimeDiff(days, 'day', lang);
}
}
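// Calendar helpers: isToday / isCurrentMonth compare a date string against the current local date.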
function isToday(dateString) {
const inputDate = new Date(dateString);
const today = new Date();
return (
inputDate.getFullYear() === today.getFullYear() &&
inputDate.getMonth() === today.getMonth() &&
inputDate.getDate() === today.getDate()
);
}
function isCurrentMonth(dateString) {
const inputDate = new Date(dateString);
const today = new Date();
return (
inputDate.getFullYear() === today.getFullYear() &&
inputDate.getMonth() === today.getMonth()
);
}
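// Builds the "N papers" header label with the correct plural form for ru/en/zh.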
function formatArticlesTitle(number, lang='ru') {
const lastDigit = number % 10;
const lastTwoDigits = number % 100;
let word;
if (!['ru', 'en', 'zh'].includes(lang)) {
throw new Error('Unsupported language. Supported languages are: ru, en, zh');
}
if (lang === 'ru') {
if (lastTwoDigits >= 11 && lastTwoDigits <= 14) {
word = "статей";
} else if (lastDigit === 1) {
word = "статья";
} else if (lastDigit >= 2 && lastDigit <= 4) {
word = "статьи";
} else {
word = "статей";
}
} else if (lang === 'en') {
if (number === 1) {
word = 'paper'
} else {
word = 'papers'
}
} else if (lang === 'zh') {
word = "篇论文"
}
if (lang === 'zh') {
return `${number}${word}`;
} else {
return `${number} ${word}`;
}
}
</script>
</head>
<body class="light-theme">
<header>
<div class="container">
<a href="https://hfday.ru" class="a-clean"><h1 class="title-sign" id="doomgrad-icon">🔺</h1><h1 class="title-text" id="doomgrad">hf daily</h1></a>
<p><span id="title-date">27 сентября</span> | <span id="title-articles-count">12 papers</span></p>
</div>
<div class="theme-switch">
<label class="switch">
<input type="checkbox" id="theme-toggle">
<span class="slider"></span>
</label>
</div>
</header>
<div class="nav-menu">
<div class="nav-container">
<span class="nav-item" id="nav-prev"><a href="/d/2024-09-26.html">⬅️ <span id="prev-date">26.09</span></a></span>
<span class="nav-item" id="nav-next"><a href="/d/2024-09-30.html">➡️ <span id="next-date">30.09</span></a></span>
<span class="nav-item" id="nav-monthly"><a href="/m/2024-09.html">📈 <span id='top-month-label'>Месяц</span></a></span>
<div class="language-flags">
<svg class="flag-svg" data-lang="ru" xmlns="http://www.w3.org/2000/svg" width="32" height="32" viewBox="0 0 32 32"><path fill="#1435a1" d="M1 11H31V21H1z"></path><path d="M5,4H27c2.208,0,4,1.792,4,4v4H1v-4c0-2.208,1.792-4,4-4Z" fill="#fff"></path><path d="M5,20H27c2.208,0,4,1.792,4,4v4H1v-4c0-2.208,1.792-4,4-4Z" transform="rotate(180 16 24)" fill="#c53a28"></path><path d="M27,4H5c-2.209,0-4,1.791-4,4V24c0,2.209,1.791,4,4,4H27c2.209,0,4-1.791,4-4V8c0-2.209-1.791-4-4-4Zm3,20c0,1.654-1.346,3-3,3H5c-1.654,0-3-1.346-3-3V8c0-1.654,1.346-3,3-3H27c1.654,0,3,1.346,3,3V24Z" opacity=".15"></path><path d="M27,5H5c-1.657,0-3,1.343-3,3v1c0-1.657,1.343-3,3-3H27c1.657,0,3,1.343,3,3v-1c0-1.657-1.343-3-3-3Z" fill="#fff" opacity=".2"></path></svg>
<svg class="flag-svg" data-lang="zh" xmlns="http://www.w3.org/2000/svg" width="32" height="32" viewBox="0 0 32 32"><rect x="1" y="4" width="30" height="24" rx="4" ry="4" fill="#db362f"></rect><path d="M27,4H5c-2.209,0-4,1.791-4,4V24c0,2.209,1.791,4,4,4H27c2.209,0,4-1.791,4-4V8c0-2.209-1.791-4-4-4Zm3,20c0,1.654-1.346,3-3,3H5c-1.654,0-3-1.346-3-3V8c0-1.654,1.346-3,3-3H27c1.654,0,3,1.346,3,3V24Z" opacity=".15"></path><path fill="#ff0" d="M7.958 10.152L7.19 7.786 6.421 10.152 3.934 10.152 5.946 11.614 5.177 13.979 7.19 12.517 9.202 13.979 8.433 11.614 10.446 10.152 7.958 10.152z"></path><path fill="#ff0" d="M12.725 8.187L13.152 8.898 13.224 8.072 14.032 7.886 13.269 7.562 13.342 6.736 12.798 7.361 12.035 7.037 12.461 7.748 11.917 8.373 12.725 8.187z"></path><path fill="#ff0" d="M14.865 10.372L14.982 11.193 15.37 10.46 16.187 10.602 15.61 10.007 15.997 9.274 15.253 9.639 14.675 9.044 14.793 9.865 14.048 10.23 14.865 10.372z"></path><path fill="#ff0" d="M15.597 13.612L16.25 13.101 15.421 13.13 15.137 12.352 14.909 13.149 14.081 13.179 14.769 13.642 14.541 14.439 15.194 13.928 15.881 14.391 15.597 13.612z"></path><path fill="#ff0" d="M13.26 15.535L13.298 14.707 12.78 15.354 12.005 15.062 12.46 15.754 11.942 16.402 12.742 16.182 13.198 16.875 13.236 16.047 14.036 15.827 13.26 15.535z"></path><path d="M27,5H5c-1.657,0-3,1.343-3,3v1c0-1.657,1.343-3,3-3H27c1.657,0,3,1.343,3,3v-1c0-1.657-1.343-3-3-3Z" fill="#fff" opacity=".2"></path></svg>
<svg class="flag-svg" data-lang="en" xmlns="http://www.w3.org/2000/svg" width="32" height="32" viewBox="0 0 32 32"><rect x="1" y="4" width="30" height="24" rx="4" ry="4" fill="#fff"></rect><path d="M1.638,5.846H30.362c-.711-1.108-1.947-1.846-3.362-1.846H5c-1.414,0-2.65,.738-3.362,1.846Z" fill="#a62842"></path><path d="M2.03,7.692c-.008,.103-.03,.202-.03,.308v1.539H31v-1.539c0-.105-.022-.204-.03-.308H2.03Z" fill="#a62842"></path><path fill="#a62842" d="M2 11.385H31V13.231H2z"></path><path fill="#a62842" d="M2 15.077H31V16.923000000000002H2z"></path><path fill="#a62842" d="M1 18.769H31V20.615H1z"></path><path d="M1,24c0,.105,.023,.204,.031,.308H30.969c.008-.103,.031-.202,.031-.308v-1.539H1v1.539Z" fill="#a62842"></path><path d="M30.362,26.154H1.638c.711,1.108,1.947,1.846,3.362,1.846H27c1.414,0,2.65-.738,3.362-1.846Z" fill="#a62842"></path><path d="M5,4h11v12.923H1V8c0-2.208,1.792-4,4-4Z" fill="#102d5e"></path><path d="M27,4H5c-2.209,0-4,1.791-4,4V24c0,2.209,1.791,4,4,4H27c2.209,0,4-1.791,4-4V8c0-2.209-1.791-4-4-4Zm3,20c0,1.654-1.346,3-3,3H5c-1.654,0-3-1.346-3-3V8c0-1.654,1.346-3,3-3H27c1.654,0,3,1.346,3,3V24Z" opacity=".15"></path><path d="M27,5H5c-1.657,0-3,1.343-3,3v1c0-1.657,1.343-3,3-3H27c1.657,0,3,1.343,3,3v-1c0-1.657-1.343-3-3-3Z" fill="#fff" opacity=".2"></path><path fill="#fff" d="M4.601 7.463L5.193 7.033 4.462 7.033 4.236 6.338 4.01 7.033 3.279 7.033 3.87 7.463 3.644 8.158 4.236 7.729 4.827 8.158 4.601 7.463z"></path><path fill="#fff" d="M7.58 7.463L8.172 7.033 7.441 7.033 7.215 6.338 6.989 7.033 6.258 7.033 6.849 7.463 6.623 8.158 7.215 7.729 7.806 8.158 7.58 7.463z"></path><path fill="#fff" d="M10.56 7.463L11.151 7.033 10.42 7.033 10.194 6.338 9.968 7.033 9.237 7.033 9.828 7.463 9.603 8.158 10.194 7.729 10.785 8.158 10.56 7.463z"></path><path fill="#fff" d="M6.066 9.283L6.658 8.854 5.927 8.854 5.701 8.158 5.475 8.854 4.744 8.854 5.335 9.283 5.109 9.979 5.701 9.549 6.292 9.979 6.066 9.283z"></path><path fill="#fff" d="M9.046 9.283L9.637 8.854 8.906 8.854 8.68 8.158 8.454 8.854 7.723 8.854 8.314 9.283 8.089 9.979 8.68 9.549 9.271 9.979 9.046 9.283z"></path><path fill="#fff" d="M12.025 9.283L12.616 8.854 11.885 8.854 11.659 8.158 11.433 8.854 10.702 8.854 11.294 9.283 11.068 9.979 11.659 9.549 12.251 9.979 12.025 9.283z"></path><path fill="#fff" d="M6.066 12.924L6.658 12.494 5.927 12.494 5.701 11.799 5.475 12.494 4.744 12.494 5.335 12.924 5.109 13.619 5.701 13.19 6.292 13.619 6.066 12.924z"></path><path fill="#fff" d="M9.046 12.924L9.637 12.494 8.906 12.494 8.68 11.799 8.454 12.494 7.723 12.494 8.314 12.924 8.089 13.619 8.68 13.19 9.271 13.619 9.046 12.924z"></path><path fill="#fff" d="M12.025 12.924L12.616 12.494 11.885 12.494 11.659 11.799 11.433 12.494 10.702 12.494 11.294 12.924 11.068 13.619 11.659 13.19 12.251 13.619 12.025 12.924z"></path><path fill="#fff" d="M13.539 7.463L14.13 7.033 13.399 7.033 13.173 6.338 12.947 7.033 12.216 7.033 12.808 7.463 12.582 8.158 13.173 7.729 13.765 8.158 13.539 7.463z"></path><path fill="#fff" d="M4.601 11.104L5.193 10.674 4.462 10.674 4.236 9.979 4.01 10.674 3.279 10.674 3.87 11.104 3.644 11.799 4.236 11.369 4.827 11.799 4.601 11.104z"></path><path fill="#fff" d="M7.58 11.104L8.172 10.674 7.441 10.674 7.215 9.979 6.989 10.674 6.258 10.674 6.849 11.104 6.623 11.799 7.215 11.369 7.806 11.799 7.58 11.104z"></path><path fill="#fff" d="M10.56 11.104L11.151 10.674 10.42 10.674 10.194 9.979 9.968 10.674 9.237 10.674 9.828 11.104 9.603 11.799 10.194 11.369 10.785 11.799 10.56 11.104z"></path><path fill="#fff" d="M13.539 11.104L14.13 10.674 13.399 
10.674 13.173 9.979 12.947 10.674 12.216 10.674 12.808 11.104 12.582 11.799 13.173 11.369 13.765 11.799 13.539 11.104z"></path><path fill="#fff" d="M4.601 14.744L5.193 14.315 4.462 14.315 4.236 13.619 4.01 14.315 3.279 14.315 3.87 14.744 3.644 15.44 4.236 15.01 4.827 15.44 4.601 14.744z"></path><path fill="#fff" d="M7.58 14.744L8.172 14.315 7.441 14.315 7.215 13.619 6.989 14.315 6.258 14.315 6.849 14.744 6.623 15.44 7.215 15.01 7.806 15.44 7.58 14.744z"></path><path fill="#fff" d="M10.56 14.744L11.151 14.315 10.42 14.315 10.194 13.619 9.968 14.315 9.237 14.315 9.828 14.744 9.603 15.44 10.194 15.01 10.785 15.44 10.56 14.744z"></path><path fill="#fff" d="M13.539 14.744L14.13 14.315 13.399 14.315 13.173 13.619 12.947 14.315 12.216 14.315 12.808 14.744 12.582 15.44 13.173 15.01 13.765 15.44 13.539 14.744z"></path></svg>
</div>
</div>
</div>
<div class="container">
<div class="sub-header-container">
<div class="update-info-container">
<label class="update-info-label" id="timeDiff"></label>
</div>
<div class="sort-container">
<label class="sort-label">🔀 <span id="sort-label-text">Сортировка по</span></label>
<select id="sort-dropdown" class="sort-dropdown">
<option value="default">рейтингу</option>
<option value="pub_date">дате публикации</option>
<option value="issue_id">добавлению на HF</option>
</select>
</div>
</div>
<div class="sub-header-container-2">
<div class="category-toggle-container">
<div class="svg-container">
<span id="category-toggle">🏷️ Фильтр</span>
<svg height="3" width="200">
<line x1="0" y1="0" x2="200" y2="0"
stroke="black"
stroke-width="2"
stroke-dasharray="3, 3" />
</svg>
</div>
</div>
<div class="category-option-container" id="category-options">
<label class="pointer" for="filter-logic-or"><input type="radio" id="filter-logic-or" name="filter-logic" value="or"> A∪B</label>
<label class="pointer" for="filter-logic-and"><input type="radio" id="filter-logic-and" name="filter-logic" value="and"> A∩B</label>
</div>
</div>
<div class="category-filters" id="category-filters">
<span class="clear-categories" id="clear-categories">🧹</span>
<!-- Categories -->
</div>
<main id="articles-container">
<!-- Articles -->
</main>
</div>
<footer>
<div class="container">
<p><a style="color:white;" href="https://t.me/doomgrad">doomgrad</a> ✖️ <a style="color:white;" href="https://huggingface.co/papers">hugging face</a></p>
</div>
</footer>
<script>
// Language handling
let currentLang = localStorage.getItem('selectedLang') || 'en';
let feedDate = {'ru': '27 сентября', 'en': 'September 27', 'zh': '9月27日'};
let feedDateNext = {'ru': '30.09', 'en': '09/30', 'zh': '9月30日'};
let feedDatePrev = {'ru': '26.09', 'en': '09/26', 'zh': '9月26日'};
let filterLabel = {'ru': 'Фильтр', 'en': 'Topics', 'zh': '主题筛选'}
let publishedLabel = {'ru': 'статья от ', 'en': 'published on ', 'zh': '发表于'}
let sortLabel = {'ru': 'Сортировка по', 'en': 'Sort by', 'zh': '排序方式'}
let paperLabel = {'ru': 'Статья', 'en': 'Paper', 'zh': '论文'}
let topMonthLabel = {'ru': 'Месяц', 'en': 'Month', 'zh': '月度论文'}
let topDayLabel = {'ru': 'День', 'en': 'Day', 'zh': '日度论文'}
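// Marks the saved language's flag as active and, on click, switches the language,
// persists it, and re-renders time labels, localized text and the article list.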
function initializeLanguageFlags() {
const flags = document.querySelectorAll('.flag-svg');
flags.forEach(flag => {
if (flag.dataset.lang === currentLang) {
flag.classList.add('active');
}
flag.addEventListener('click', () => {
flags.forEach(f => f.classList.remove('active'));
flag.classList.add('active');
currentLang = flag.dataset.lang;
localStorage.setItem('selectedLang', currentLang);
updateTimeDiffs();
updateLocalization();
filterAndRenderArticles();
});
});
}
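// Toggles light/dark theme, persists the choice in localStorage and swaps the
// header branding between "hf daily" and "hf nightly".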
function toggleTheme() {
const body = document.body;
body.classList.toggle('light-theme');
body.classList.toggle('dark-theme');
const isDarkMode = body.classList.contains('dark-theme');
localStorage.setItem('darkMode', isDarkMode);
if (isDarkMode) {
const title = document.getElementById('doomgrad');
title.innerHTML = "hf nightly";
const titleSign = document.getElementById('doomgrad-icon');
titleSign.classList.add('rotate');
} else {
const title = document.getElementById('doomgrad');
title.innerHTML = "hf daily";
const titleSign = document.getElementById('doomgrad-icon');
titleSign.classList.remove('rotate');
}
}
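// Paper feed data for this issue: each entry holds the HF paper URL, abstract, score,
// publication date, authors/affiliations, category tags and localized (ru/en/zh) summaries.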
const articlesData = [{'id': 'https://huggingface.co/papers/2409.17481', 'title': 'MaskLLM: Learnable Semi-Structured Sparsity for Large Language Models', 'url': 'https://huggingface.co/papers/2409.17481', 'abstract': "Large Language Models (LLMs) are distinguished by their massive parameter counts, which typically result in significant redundancy. This work introduces MaskLLM, a learnable pruning method that establishes Semi-structured (or ``N:M'') Sparsity in LLMs, aimed at reducing computational overhead during inference. Instead of developing a new importance criterion, MaskLLM explicitly models N:M patterns as a learnable distribution through Gumbel Softmax sampling. This approach facilitates end-to-end training on large-scale datasets and offers two notable advantages: 1) High-quality Masks - our method effectively scales to large datasets and learns accurate masks; 2) Transferability - the probabilistic modeling of mask distribution enables the transfer learning of sparsity across domains or tasks. We assessed MaskLLM using 2:4 sparsity on various LLMs, including LLaMA-2, Nemotron-4, and GPT-3, with sizes ranging from 843M to 15B parameters, and our empirical results show substantial improvements over state-of-the-art methods. For instance, leading approaches achieve a perplexity (PPL) of 10 or greater on Wikitext compared to the dense model's 5.12 PPL, but MaskLLM achieves a significantly lower 6.72 PPL solely by learning the masks with frozen weights. Furthermore, MaskLLM's learnable nature allows customized masks for lossless application of 2:4 sparsity to downstream tasks or domains. Code is available at https://github.com/NVlabs/MaskLLM.", 'score': 46, 'issue_id': 1, 'pub_date': '2024-09-26', 'pub_date_card': {'ru': '26 сентября', 'en': 'September 26', 'zh': '9月26日'}, 'hash': '9bb73b25aad1001a', 'authors': ['Gongfan Fang', 'Hongxu Yin', 'Saurav Muralidharan', 'Greg Heinrich', 'Jeff Pool', 'Jan Kautz', 'Pavlo Molchanov', 'Xinchao Wang'], 'affiliations': ['NVIDIA', 'National University of Singapore'], 'pdf_title_img': 'assets\\pdf\\title_img\\2409.17481.jpg', 'data': {'categories': ['#dataset', '#training', '#inference', '#optimization', '#transfer_learning', '#open_source', '#architecture'], 'emoji': '✂️', 'ru': {'title': 'MaskLLM: Эффективное обучение разреженности в больших языковых моделях', 'desc': 'Статья представляет MaskLLM - метод обучаемой обрезки для создания полуструктурированной разреженности в больших языковых моделях (LLM). MaskLLM моделирует паттерны N:M как обучаемое распределение с помощью выборки Гумбеля-Софтмакса, что позволяет проводить сквозное обучение на крупномасштабных наборах данных. Метод обеспечивает высококачественные маски и возможность переноса обучения разреженности между доменами или задачами. Эмпирические результаты показывают значительные улучшения по сравнению с современными методами при применении 2:4 разреженности к различным LLM.'}, 'en': {'title': 'Efficient Pruning of Large Language Models with MaskLLM', 'desc': 'This paper presents MaskLLM, a novel method for pruning large language models (LLMs) by introducing Semi-structured (N:M) sparsity to reduce computational costs during inference. MaskLLM utilizes Gumbel Softmax sampling to model N:M patterns as a learnable distribution, allowing for end-to-end training on extensive datasets. The method not only generates high-quality masks that scale effectively but also enables transfer learning of sparsity across different tasks. 
Empirical results demonstrate that MaskLLM outperforms existing methods, achieving lower perplexity scores while maintaining the ability to apply customized masks for various downstream applications.'}, 'zh': {'title': 'MaskLLM:高效稀疏化的大型语言模型', 'desc': '大型语言模型(LLMs)通常具有大量参数,导致计算冗余。本文提出了一种名为MaskLLM的可学习剪枝方法,通过建立半结构化(或“N:M”)稀疏性来减少推理过程中的计算开销。MaskLLM通过Gumbel Softmax采样显式建模N:M模式,支持在大规模数据集上进行端到端训练。实验结果表明,MaskLLM在多个LLM上实现了显著的性能提升,且其可学习特性使得在不同任务或领域间的稀疏性转移成为可能。'}}}, {'id': 'https://huggingface.co/papers/2409.18042', 'title': 'EMOVA: Empowering Language Models to See, Hear and Speak with Vivid Emotions', 'url': 'https://huggingface.co/papers/2409.18042', 'abstract': 'GPT-4o, an omni-modal model that enables vocal conversations with diverse emotions and tones, marks a milestone for omni-modal foundation models. However, empowering Large Language Models to perceive and generate images, texts, and speeches end-to-end with publicly available data remains challenging in the open-source community. Existing vision-language models rely on external tools for the speech processing, while speech-language models still suffer from limited or even without vision-understanding abilities. To address this gap, we propose EMOVA (EMotionally Omni-present Voice Assistant), to enable Large Language Models with end-to-end speech capabilities while maintaining the leading vision-language performance. With a semantic-acoustic disentangled speech tokenizer, we notice surprisingly that omni-modal alignment can further enhance vision-language and speech abilities compared with the corresponding bi-modal aligned counterparts. Moreover, a lightweight style module is proposed for flexible speech style controls (e.g., emotions and pitches). For the first time, EMOVA achieves state-of-the-art performance on both the vision-language and speech benchmarks, and meanwhile, supporting omni-modal spoken dialogue with vivid emotions.', 'score': 36, 'issue_id': 1, 'pub_date': '2024-09-26', 'pub_date_card': {'ru': '26 сентября', 'en': 'September 26', 'zh': '9月26日'}, 'hash': '227cd783a8a6d39c', 'authors': ['Kai Chen', 'Yunhao Gou', 'Runhui Huang', 'Zhili Liu', 'Daxin Tan', 'Jing Xu', 'Chunwei Wang', 'Yi Zhu', 'Yihan Zeng', 'Kuo Yang', 'Dingdong Wang', 'Kun Xiang', 'Haoyuan Li', 'Haoli Bai', 'Jianhua Han', 'Xiaohui Li', 'Weike Jin', 'Nian Xie', 'Yu Zhang', 'James T. Kwok', 'Hengshuang Zhao', 'Xiaodan Liang', 'Dit-Yan Yeung', 'Xiao Chen', 'Zhenguo Li', 'Wei Zhang', 'Qun Liu', 'Jun Yao', 'Lanqing Hong', 'Lu Hou', 'Hang Xu'], 'affiliations': ['Hong Kong University of Science and Technology', 'Huawei Noahs Ark Lab', 'Southern University of Science and Technology', 'Sun Yat-sen University', 'The Chinese University of Hong Kong', 'The University of Hong Kong'], 'pdf_title_img': 'assets\\pdf\\title_img\\2409.18042.jpg', 'data': {'categories': ['#audio', '#cv', '#benchmark', '#alignment', '#open_source', '#architecture', '#synthetic', '#multimodal'], 'emoji': '🗣️', 'ru': {'title': 'EMOVA: прорыв в омнимодальном ИИ с эмоциональным речевым интерфейсом', 'desc': 'EMOVA - это омнимодальная модель, объединяющая возможности обработки изображений, текста и речи. Она использует семантико-акустический разделенный токенизатор речи для улучшения языковых и речевых способностей. EMOVA достигает передовых результатов как в задачах зрения-языка, так и в речевых тестах. 
Модель также поддерживает омнимодальный разговорный диалог с различными эмоциями.'}, 'en': {'title': 'EMOVA: Bridging Speech and Vision for Emotionally Intelligent Conversations', 'desc': 'The paper introduces EMOVA, a new model designed to enhance Large Language Models (LLMs) by integrating speech capabilities with vision-language performance. EMOVA utilizes a semantic-acoustic disentangled speech tokenizer, which allows for better alignment between visual and auditory data, improving overall model performance. Additionally, it features a lightweight style module that enables control over speech styles, such as emotions and pitches. This approach achieves state-of-the-art results in both vision-language and speech tasks, facilitating more expressive and emotionally aware spoken dialogues.'}, 'zh': {'title': '情感全能语音助手:打破模态界限的创新', 'desc': '本论文介绍了EMOVA(情感全能语音助手),这是一个能够实现端到端语音能力的大型语言模型。EMOVA通过语义-声学解耦的语音标记器,提升了视觉-语言和语音能力的对齐效果。与现有的双模态模型相比,EMOVA在视觉-语言和语音基准测试中都达到了最先进的性能。该模型还引入了轻量级风格模块,支持灵活的语音风格控制,如情感和音调。'}}}, {'id': 'https://huggingface.co/papers/2409.18125', 'title': 'LLaVA-3D: A Simple yet Effective Pathway to Empowering LMMs with 3D-awareness', 'url': 'https://huggingface.co/papers/2409.18125', 'abstract': 'Recent advancements in Large Multimodal Models (LMMs) have greatly enhanced their proficiency in 2D visual understanding tasks, enabling them to effectively process and understand images and videos. However, the development of LMMs with 3D-awareness for 3D scene understanding has been hindered by the lack of large-scale 3D vision-language datasets and powerful 3D encoders. In this paper, we introduce a simple yet effective framework called LLaVA-3D. Leveraging the strong 2D understanding priors from LLaVA, our LLaVA-3D efficiently adapts LLaVA for 3D scene understanding without compromising 2D understanding capabilities. To achieve this, we employ a simple yet effective representation, 3D Patch, which connects 2D CLIP patch features with their corresponding positions in 3D space. By integrating the 3D Patches into 2D LMMs and employing joint 2D and 3D vision-language instruction tuning, we establish a unified architecture for both 2D image understanding and 3D scene understanding. Experimental results show that LLaVA-3D converges 3.5x faster than existing 3D LMMs when trained on 3D vision-language datasets. Moreover, LLaVA-3D not only achieves state-of-the-art performance across various 3D tasks but also maintains comparable 2D image understanding and vision-language conversation capabilities with LLaVA.', 'score': 33, 'issue_id': 1, 'pub_date': '2024-09-26', 'pub_date_card': {'ru': '26 сентября', 'en': 'September 26', 'zh': '9月26日'}, 'hash': '4ca82aa848fc15ec', 'authors': ['Chenming Zhu', 'Tai Wang', 'Wenwei Zhang', 'Jiangmiao Pang', 'Xihui Liu'], 'affiliations': ['Shanghai AI Laboratory', 'The University of Hong Kong'], 'pdf_title_img': 'assets\\pdf\\title_img\\2409.18125.jpg', 'data': {'categories': ['#dataset', '#cv', '#training', '#graphs', '#optimization', '#transfer_learning', '#architecture', '#multimodal', '#3d'], 'emoji': '🧠', 'ru': {'title': 'LLaVA-3D: Эффективный переход от 2D к 3D пониманию для мультимодальных моделей', 'desc': 'Статья представляет LLaVA-3D - фреймворк для адаптации моделей 2D понимания изображений к задачам 3D понимания сцен. Авторы используют концепцию 3D Patch, связывающую 2D признаки CLIP с их позициями в 3D пространстве. LLaVA-3D обучается быстрее существующих 3D моделей и достигает state-of-the-art результатов в 3D задачах. 
При этом модель сохраняет способности к пониманию 2D изображений на уровне базовой LLaVA.'}, 'en': {'title': 'Bridging 2D and 3D: LLaVA-3D Unifies Visual Understanding', 'desc': 'This paper presents LLaVA-3D, a framework designed to enhance Large Multimodal Models (LMMs) for 3D scene understanding while retaining their 2D visual comprehension abilities. The authors address the challenge of limited 3D vision-language datasets and the need for robust 3D encoders by introducing a novel representation called 3D Patch, which links 2D features to their 3D spatial locations. By integrating these 3D Patches into existing 2D LMMs and utilizing joint instruction tuning, LLaVA-3D achieves a unified approach for processing both 2D and 3D data. Experimental results demonstrate that LLaVA-3D trains 3.5 times faster than current 3D LMMs and excels in various 3D tasks while maintaining strong performance in 2D image understanding.'}, 'zh': {'title': 'LLaVA-3D:统一2D与3D场景理解的创新框架', 'desc': '本文介绍了一种新的框架LLaVA-3D,旨在提升大型多模态模型(LMMs)在3D场景理解方面的能力。通过结合2D CLIP特征与3D空间位置,LLaVA-3D有效地将2D理解能力扩展到3D场景中。该框架采用简单有效的3D Patch表示,并通过联合的2D和3D视觉语言指令调优,建立了统一的架构。实验结果表明,LLaVA-3D在训练速度上比现有的3D LMMs快3.5倍,并在多个3D任务上实现了最先进的性能,同时保持了与LLaVA相当的2D图像理解能力。'}}}, {'id': 'https://huggingface.co/papers/2409.18124', 'title': 'Lotus: Diffusion-based Visual Foundation Model for High-quality Dense Prediction', 'url': 'https://huggingface.co/papers/2409.18124', 'abstract': 'Leveraging the visual priors of pre-trained text-to-image diffusion models offers a promising solution to enhance zero-shot generalization in dense prediction tasks. However, existing methods often uncritically use the original diffusion formulation, which may not be optimal due to the fundamental differences between dense prediction and image generation. In this paper, we provide a systemic analysis of the diffusion formulation for the dense prediction, focusing on both quality and efficiency. And we find that the original parameterization type for image generation, which learns to predict noise, is harmful for dense prediction; the multi-step noising/denoising diffusion process is also unnecessary and challenging to optimize. Based on these insights, we introduce Lotus, a diffusion-based visual foundation model with a simple yet effective adaptation protocol for dense prediction. Specifically, Lotus is trained to directly predict annotations instead of noise, thereby avoiding harmful variance. We also reformulate the diffusion process into a single-step procedure, simplifying optimization and significantly boosting inference speed. Additionally, we introduce a novel tuning strategy called detail preserver, which achieves more accurate and fine-grained predictions. Without scaling up the training data or model capacity, Lotus achieves SoTA performance in zero-shot depth and normal estimation across various datasets. 
It also significantly enhances efficiency, being hundreds of times faster than most existing diffusion-based methods.', 'score': 31, 'issue_id': 1, 'pub_date': '2024-09-26', 'pub_date_card': {'ru': '26 сентября', 'en': 'September 26', 'zh': '9月26日'}, 'hash': '55be564bbee47eed', 'authors': ['Jing He', 'Haodong Li', 'Wei Yin', 'Yixun Liang', 'Leheng Li', 'Kaiqiang Zhou', 'Hongbo Zhang', 'Bingbing Liu', 'Ying-Cong Chen'], 'affiliations': ['HKUST', 'HKUST(GZ)', 'Noahs Ark Lab', 'University of Adelaide'], 'pdf_title_img': 'assets\\pdf\\title_img\\2409.18124.jpg', 'data': {'categories': ['#dataset', '#cv', '#inference', '#optimization', '#transfer_learning', '#diffusion', '#architecture'], 'emoji': '🌸', 'ru': {'title': 'Lotus: Эффективное плотное предсказание с помощью оптимизированной диффузионной модели', 'desc': 'Статья представляет Lotus - новую модель машинного обучения для решения задач плотного предсказания на основе диффузионных моделей. Авторы предлагают изменения в стандартной формулировке диффузионного процесса, оптимизируя его для задач плотного предсказания. Lotus обучается напрямую предсказывать аннотации вместо шума и использует одношаговый процесс диффузии, что значительно ускоряет вывод. Модель достигает передовых результатов в задачах оценки глубины и нормалей без дополнительного обучения.'}, 'en': {'title': 'Lotus: Revolutionizing Dense Prediction with Efficient Diffusion', 'desc': 'This paper presents Lotus, a new diffusion-based visual foundation model designed to improve zero-shot generalization in dense prediction tasks. The authors analyze the limitations of traditional diffusion methods, which are primarily suited for image generation, and highlight their inefficiencies when applied to dense prediction. By directly predicting annotations instead of noise and reformulating the diffusion process into a single-step procedure, Lotus simplifies optimization and enhances inference speed. The model achieves state-of-the-art performance in depth and normal estimation without requiring additional training data or increased model size.'}, 'zh': {'title': 'Lotus:高效的密集预测扩散模型', 'desc': '本文提出了一种新的方法,利用预训练的文本到图像扩散模型来提高密集预测任务的零-shot泛化能力。我们分析了现有扩散模型在密集预测中的不足,发现原有的噪声预测参数化方式对密集预测有害。为此,我们引入了Lotus模型,直接预测标注而非噪声,并将扩散过程简化为单步程序,从而提高了优化效率和推理速度。Lotus在多个数据集上实现了最先进的零-shot深度和法线估计性能,同时在效率上也大幅提升。'}}}, {'id': 'https://huggingface.co/papers/2409.14254', 'title': 'Instruction Following without Instruction Tuning', 'url': 'https://huggingface.co/papers/2409.14254', 'abstract': "Instruction tuning commonly means finetuning a language model on instruction-response pairs. We discover two forms of adaptation (tuning) that are deficient compared to instruction tuning, yet still yield instruction following; we call this implicit instruction tuning. We first find that instruction-response pairs are not necessary: training solely on responses, without any corresponding instructions, yields instruction following. This suggests pretrained models have an instruction-response mapping which is revealed by teaching the model the desired distribution of responses. However, we then find it's not necessary to teach the desired distribution of responses: instruction-response training on narrow-domain data like poetry still leads to broad instruction-following behavior like recipe generation. In particular, when instructions are very different from those in the narrow finetuning domain, models' responses do not adhere to the style of the finetuning domain. 
To begin to explain implicit instruction tuning, we hypothesize that very simple changes to a language model's distribution yield instruction following. We support this by hand-writing a rule-based language model which yields instruction following in a product-of-experts with a pretrained model. The rules are to slowly increase the probability of ending the sequence, penalize repetition, and uniformly change 15 words' probabilities. In summary, adaptations made without being designed to yield instruction following can do so implicitly.", 'score': 27, 'issue_id': 1, 'pub_date': '2024-09-21', 'pub_date_card': {'ru': '21 сентября', 'en': 'September 21', 'zh': '9月21日'}, 'hash': '928d018d2936e022', 'authors': ['John Hewitt', 'Nelson F. Liu', 'Percy Liang', 'Christopher D. Manning'], 'affiliations': ['Department of Computer Science, Stanford University'], 'pdf_title_img': 'assets\\pdf\\title_img\\2409.14254.jpg', 'data': {'categories': ['#reasoning', '#training', '#interpretability', '#alignment', '#architecture'], 'emoji': '🧠', 'ru': {'title': 'Скрытые возможности языковых моделей: неявное обучение следованию инструкциям', 'desc': "Исследователи обнаружили, что языковые модели могут научиться следовать инструкциям без явного обучения на парах инструкция-ответ. Этот феномен назван 'неявной настройкой на инструкции'. Выяснилось, что достаточно обучения только на ответах или даже на узкоспециализированных данных для получения широких навыков следования инструкциям. Авторы предполагают, что даже простые изменения в распределении вероятностей языковой модели могут привести к способности следовать инструкциям."}, 'en': {'title': 'Unlocking Instruction Following Without Explicit Instructions', 'desc': "This paper explores a new concept called implicit instruction tuning, which shows that language models can learn to follow instructions even without explicit instruction-response pairs. The authors demonstrate that training a model solely on responses can still lead to effective instruction following, suggesting that pretrained models already have an inherent understanding of instruction-response mappings. They also find that training on narrow-domain data can produce broad instruction-following behavior, indicating that the model can generalize beyond its training context. The study proposes that simple adjustments to a model's output distribution can facilitate this implicit learning process."}, 'zh': {'title': '隐式指令调优:无需指令也能实现指令跟随', 'desc': '本文探讨了指令调优的概念,发现有两种适应形式虽然不如指令调优有效,但仍能实现指令跟随。研究表明,仅通过响应进行训练,而不需要对应的指令,也能使模型遵循指令。这表明预训练模型内部存在指令与响应的映射关系。此外,作者提出简单的模型调整可以实现指令跟随,甚至在狭窄领域的数据上进行训练也能产生广泛的指令跟随行为。'}}}, {'id': 'https://huggingface.co/papers/2409.17422', 'title': 'Discovering the Gems in Early Layers: Accelerating Long-Context LLMs with 1000x Input Token Reduction', 'url': 'https://huggingface.co/papers/2409.17422', 'abstract': 'Large Language Models (LLMs) have demonstrated remarkable capabilities in handling long context inputs, but this comes at the cost of increased computational resources and latency. Our research introduces a novel approach for the long context bottleneck to accelerate LLM inference and reduce GPU memory consumption. Our research demonstrates that LLMs can identify relevant tokens in the early layers before generating answers to a query. Leveraging this insight, we propose an algorithm that uses early layers of an LLM as filters to select and compress input tokens, significantly reducing the context length for subsequent processing. 
Our method, GemFilter, demonstrates substantial improvements in both speed and memory efficiency compared to existing techniques, such as standard attention and SnapKV/H2O. Notably, it achieves a 2.4times speedup and 30\\% reduction in GPU memory usage compared to SOTA methods. Evaluation on the Needle in a Haystack task shows that GemFilter significantly outperforms standard attention, SnapKV and demonstrates comparable performance on the LongBench challenge. GemFilter is simple, training-free, and broadly applicable across different LLMs. Crucially, it provides interpretability by allowing humans to inspect the selected input sequence. These findings not only offer practical benefits for LLM deployment, but also enhance our understanding of LLM internal mechanisms, paving the way for further optimizations in LLM design and inference. Our code is available at https://github.com/SalesforceAIResearch/GemFilter.', 'score': 24, 'issue_id': 1, 'pub_date': '2024-09-25', 'pub_date_card': {'ru': '25 сентября', 'en': 'September 25', 'zh': '9月25日'}, 'hash': '830f07f8f88f0a79', 'authors': ['Zhenmei Shi', 'Yifei Ming', 'Xuan-Phi Nguyen', 'Yingyu Liang', 'Shafiq Joty'], 'affiliations': [], 'pdf_title_img': 'assets\\pdf\\title_img\\2409.17422.jpg', 'data': {'categories': ['#long_context', '#training', '#inference', '#interpretability', '#optimization', '#open_source', '#architecture'], 'emoji': '🚀', 'ru': {'title': 'GemFilter: Ускорение LLM без потери качества', 'desc': 'Исследователи представили новый метод GemFilter для ускорения вывода больших языковых моделей (LLM) и уменьшения потребления памяти GPU при работе с длинным контекстом. GemFilter использует ранние слои LLM в качестве фильтров для выбора и сжатия входных токенов, значительно сокращая длину контекста для последующей обработки. Метод демонстрирует существенное улучшение скорости и эффективности использования памяти по сравнению с существующими техниками. GemFilter также обеспечивает интерпретируемость, позволяя людям проверять выбранную входную последовательность.'}, 'en': {'title': 'Accelerating LLMs with Efficient Token Filtering', 'desc': 'This paper presents GemFilter, a new method designed to improve the efficiency of Large Language Models (LLMs) when processing long context inputs. By utilizing early layers of the LLM to filter and compress input tokens, GemFilter reduces the amount of data that needs to be processed in later layers, leading to faster inference times and lower GPU memory usage. The results show that GemFilter achieves a 2.4 times speedup and a 30% reduction in memory consumption compared to state-of-the-art techniques. Additionally, it provides interpretability by allowing users to examine the selected input tokens, enhancing both practical deployment and understanding of LLMs.'}, 'zh': {'title': 'GemFilter:加速大型语言模型的推理与内存优化', 'desc': '大型语言模型(LLMs)在处理长上下文输入方面表现出色,但这需要更多的计算资源和延迟。我们的研究提出了一种新方法,旨在加速LLM推理并减少GPU内存消耗。我们发现LLMs可以在生成答案之前,在早期层识别相关的输入标记。基于这一发现,我们提出的GemFilter算法利用LLM的早期层作为过滤器,选择和压缩输入标记,从而显著减少后续处理的上下文长度。'}}}, {'id': 'https://huggingface.co/papers/2409.17565', 'title': 'Pixel-Space Post-Training of Latent Diffusion Models', 'url': 'https://huggingface.co/papers/2409.17565', 'abstract': 'Latent diffusion models (LDMs) have made significant advancements in the field of image generation in recent years. One major advantage of LDMs is their ability to operate in a compressed latent space, allowing for more efficient training and deployment. However, despite these advantages, challenges with LDMs still remain. 
For example, it has been observed that LDMs often generate high-frequency details and complex compositions imperfectly. We hypothesize that one reason for these flaws is due to the fact that all pre- and post-training of LDMs are done in latent space, which is typically 8 times 8 lower spatial-resolution than the output images. To address this issue, we propose adding pixel-space supervision in the post-training process to better preserve high-frequency details. Experimentally, we show that adding a pixel-space objective significantly improves both supervised quality fine-tuning and preference-based post-training by a large margin on a state-of-the-art DiT transformer and U-Net diffusion models in both visual quality and visual flaw metrics, while maintaining the same text alignment quality.', 'score': 19, 'issue_id': 1, 'pub_date': '2024-09-26', 'pub_date_card': {'ru': '26 сентября', 'en': 'September 26', 'zh': '9月26日'}, 'hash': 'fa618de81a80ad24', 'authors': ['Christina Zhang', 'Simran Motwani', 'Matthew Yu', 'Ji Hou', 'Felix Juefei-Xu', 'Sam Tsai', 'Peter Vajda', 'Zijian He', 'Jialiang Wang'], 'affiliations': ['Meta GenAI, Menlo Park, CA', 'Princeton University, Princeton, NJ'], 'pdf_title_img': 'assets\\pdf\\title_img\\2409.17565.jpg', 'data': {'categories': ['#cv', '#training', '#optimization', '#diffusion', '#architecture'], 'emoji': '🖼️', 'ru': {'title': 'Улучшение качества генерации изображений через пиксельный контроль в латентных диффузионных моделях', 'desc': 'Латентные диффузионные модели (LDM) значительно продвинулись в области генерации изображений, но всё ещё имеют проблемы с высокочастотными деталями и сложными композициями. Авторы предполагают, что это связано с обучением в латентном пространстве с низким разрешением. Они предлагают добавить контроль в пиксельном пространстве при пост-обучении для улучшения качества деталей. Эксперименты показывают, что этот подход значительно улучшает качество изображений и уменьшает визуальные дефекты в современных диффузионных моделях.'}, 'en': {'title': 'Enhancing Image Quality in Latent Diffusion Models with Pixel-Space Supervision', 'desc': 'Latent diffusion models (LDMs) are advanced techniques for generating images, leveraging a compressed latent space for efficient training. However, they struggle with producing high-frequency details and complex compositions accurately. This paper suggests that the issue arises because LDMs operate in a lower resolution latent space during training. To improve the quality of generated images, the authors propose incorporating pixel-space supervision in the post-training phase, which significantly enhances visual quality without compromising text alignment.'}, 'zh': {'title': '提升图像生成质量的潜在空间监督', 'desc': '潜在扩散模型(LDMs)在图像生成领域取得了显著进展。LDMs的一个主要优点是能够在压缩的潜在空间中操作,从而实现更高效的训练和部署。然而,LDMs仍然面临一些挑战,例如生成高频细节和复杂构图时的不足。为了解决这个问题,我们提出在后期训练过程中增加像素空间监督,以更好地保留高频细节,并通过实验验证了这一方法的有效性。'}}}, {'id': 'https://huggingface.co/papers/2409.14195', 'title': 'The Imperative of Conversation Analysis in the Era of LLMs: A Survey of Tasks, Techniques, and Trends', 'url': 'https://huggingface.co/papers/2409.14195', 'abstract': 'In the era of large language models (LLMs), a vast amount of conversation logs will be accumulated thanks to the rapid development trend of language UI. Conversation Analysis (CA) strives to uncover and analyze critical information from conversation data, streamlining manual processes and supporting business insights and decision-making. 
The need for CA to extract actionable insights and drive empowerment is becoming increasingly prominent and attracting widespread attention. However, the lack of a clear scope for CA leads to a dispersion of various techniques, making it difficult to form a systematic technical synergy to empower business applications. In this paper, we perform a thorough review and systematize CA task to summarize the existing related work. Specifically, we formally define CA task to confront the fragmented and chaotic landscape in this field, and derive four key steps of CA from conversation scene reconstruction, to in-depth attribution analysis, and then to performing targeted training, finally generating conversations based on the targeted training for achieving the specific goals. In addition, we showcase the relevant benchmarks, discuss potential challenges and point out future directions in both industry and academia. In view of current advancements, it is evident that the majority of efforts are still concentrated on the analysis of shallow conversation elements, which presents a considerable gap between the research and business, and with the assist of LLMs, recent work has shown a trend towards research on causality and strategic tasks which are sophisticated and high-level. The analyzed experiences and insights will inevitably have broader application value in business operations that target conversation logs.', 'score': 11, 'issue_id': 1, 'pub_date': '2024-09-21', 'pub_date_card': {'ru': '21 сентября', 'en': 'September 21', 'zh': '9月21日'}, 'hash': 'fc04ee445bfa493b', 'authors': ['Xinghua Zhang', 'Haiyang Yu', 'Yongbin Li', 'Minzheng Wang', 'Longze Chen', 'Fei Huang'], 'affiliations': ['Alibaba Group, China'], 'pdf_title_img': 'assets\\pdf\\title_img\\2409.14195.jpg', 'data': {'categories': ['#science', '#survey', '#training', '#data', '#benchmark', '#multimodal'], 'emoji': '💬', 'ru': {'title': 'Анализ разговоров: от поверхностного анализа к глубокому пониманию с помощью LLM', 'desc': 'Эта статья посвящена анализу разговоров (Conversation Analysis, CA) в контексте широкого распространения больших языковых моделей (LLM). Авторы систематизируют задачи CA, выделяя четыре ключевых этапа: реконструкция сцены разговора, глубокий анализ атрибуций, целевое обучение и генерация разговоров для достижения конкретных целей. В работе обсуждаются существующие методики, потенциальные проблемы и будущие направления исследований в этой области. Отмечается, что большинство текущих исследований сосредоточено на анализе поверхностных элементов разговора, но с помощью LLM наблюдается тенденция к изучению более сложных аспектов, таких как причинно-следственные связи и стратегические задачи.'}, 'en': {'title': 'Empowering Business Insights through Systematic Conversation Analysis', 'desc': 'This paper reviews the field of Conversation Analysis (CA) in the context of large language models (LLMs) and their ability to process conversation logs. It defines the CA task systematically, outlining four key steps: reconstructing conversation scenes, conducting in-depth attribution analysis, performing targeted training, and generating conversations for specific goals. The authors highlight the current focus on shallow conversation elements and the need for deeper analysis to bridge the gap between research and practical business applications. 
They also discuss benchmarks, challenges, and future directions for CA in both industry and academia, emphasizing the potential of LLMs to enhance strategic conversation tasks.'}, 'zh': {'title': '系统化对话分析,驱动商业洞察', 'desc': '在大型语言模型(LLMs)时代,随着语言用户界面的快速发展,积累了大量的对话日志。对话分析(CA)旨在从对话数据中提取和分析关键信息,以简化手动流程并支持商业洞察和决策。本文对CA任务进行了全面回顾和系统化,明确了CA的定义,并提出了从对话场景重建到深入归因分析、再到针对性训练的四个关键步骤。通过展示相关基准和讨论潜在挑战,本文指出了行业和学术界未来的发展方向。'}}}, {'id': 'https://huggingface.co/papers/2409.17280', 'title': 'Disco4D: Disentangled 4D Human Generation and Animation from a Single Image', 'url': 'https://huggingface.co/papers/2409.17280', 'abstract': 'We present Disco4D, a novel Gaussian Splatting framework for 4D human generation and animation from a single image. Different from existing methods, Disco4D distinctively disentangles clothings (with Gaussian models) from the human body (with SMPL-X model), significantly enhancing the generation details and flexibility. It has the following technical innovations. 1) Disco4D learns to efficiently fit the clothing Gaussians over the SMPL-X Gaussians. 2) It adopts diffusion models to enhance the 3D generation process, e.g., modeling occluded parts not visible in the input image. 3) It learns an identity encoding for each clothing Gaussian to facilitate the separation and extraction of clothing assets. Furthermore, Disco4D naturally supports 4D human animation with vivid dynamics. Extensive experiments demonstrate the superiority of Disco4D on 4D human generation and animation tasks. Our visualizations can be found in https://disco-4d.github.io/.', 'score': 9, 'issue_id': 1, 'pub_date': '2024-09-25', 'pub_date_card': {'ru': '25 сентября', 'en': 'September 25', 'zh': '9月25日'}, 'hash': 'b076d30e6256f634', 'authors': ['Hui En Pang', 'Shuai Liu', 'Zhongang Cai', 'Lei Yang', 'Tianwei Zhang', 'Ziwei Liu'], 'affiliations': ['S-Lab, Nanyang Technological University', 'SenseTime Research', 'Shanghai AI Laboratory'], 'pdf_title_img': 'assets\\pdf\\title_img\\2409.17280.jpg', 'data': {'categories': ['#cv', '#games', '#diffusion', '#architecture', '#3d'], 'emoji': '👕', 'ru': {'title': 'Реалистичная генерация и анимация 3D-людей из одного фото', 'desc': 'Disco4D - это новая система для генерации и анимации 3D-моделей людей по одному изображению, основанная на методе Gaussian Splatting. Она отделяет одежду от тела человека, используя гауссовы модели для одежды и модель SMPL-X для тела. Система применяет диффузионные модели для улучшения процесса 3D-генерации и обучает кодирование идентичности для каждого гауссиана одежды. Disco4D позволяет создавать реалистичную 4D-анимацию людей с динамическими эффектами.'}, 'en': {'title': 'Revolutionizing 4D Human Generation with Disco4D', 'desc': 'Disco4D is a new framework that uses Gaussian Splatting to create and animate 4D human figures from just one image. It separates clothing from the human body using Gaussian models and the SMPL-X model, which improves detail and flexibility in the generated images. The framework incorporates diffusion models to better generate 3D representations, even for parts of the body that are not visible in the original image. 
Additionally, it includes a unique identity encoding for clothing, allowing for easier management of clothing assets and enabling dynamic 4D animations.'}, 'zh': {'title': 'Disco4D:从单图像生成动态4D人类模型', 'desc': 'Disco4D是一种新颖的高斯点云框架,用于从单张图像生成和动画化4D人类模型。与现有方法不同,Disco4D将服装(使用高斯模型)与人体(使用SMPL-X模型)有效分离,从而显著提高了生成的细节和灵活性。该方法通过高效拟合服装高斯模型和SMPL-X高斯模型,采用扩散模型增强3D生成过程,并为每个服装高斯学习身份编码,以便于分离和提取服装资产。此外,Disco4D自然支持生动的4D人类动画。'}}}, {'id': 'https://huggingface.co/papers/2409.14683', 'title': 'Reducing the Footprint of Multi-Vector Retrieval with Minimal Performance Impact via Token Pooling', 'url': 'https://huggingface.co/papers/2409.14683', 'abstract': 'Over the last few years, multi-vector retrieval methods, spearheaded by ColBERT, have become an increasingly popular approach to Neural IR. By storing representations at the token level rather than at the document level, these methods have demonstrated very strong retrieval performance, especially in out-of-domain settings. However, the storage and memory requirements necessary to store the large number of associated vectors remain an important drawback, hindering practical adoption. In this paper, we introduce a simple clustering-based token pooling approach to aggressively reduce the number of vectors that need to be stored. This method can reduce the space & memory footprint of ColBERT indexes by 50% with virtually no retrieval performance degradation. This method also allows for further reductions, reducing the vector count by 66%-to-75% , with degradation remaining below 5% on a vast majority of datasets. Importantly, this approach requires no architectural change nor query-time processing, and can be used as a simple drop-in during indexation with any ColBERT-like model.', 'score': 8, 'issue_id': 1, 'pub_date': '2024-09-23', 'pub_date_card': {'ru': '23 сентября', 'en': 'September 23', 'zh': '9月23日'}, 'hash': 'd7dda0c648e6ab9d', 'authors': ['Benjamin Clavié', 'Antoine Chaffin', 'Griffin Adams'], 'affiliations': ['Answer.AI Japan', 'Answer.AI USA', 'LightOn France'], 'pdf_title_img': 'assets\\pdf\\title_img\\2409.14683.jpg', 'data': {'categories': ['#rag', '#inference', '#graphs', '#optimization', '#data', '#benchmark'], 'emoji': '🗜️', 'ru': {'title': 'Эффективное сжатие индексов ColBERT без потери качества поиска', 'desc': 'Статья представляет новый подход к уменьшению объема хранимых векторов в многовекторных методах информационного поиска, таких как ColBERT. Авторы предлагают метод кластеризации токенов, который позволяет сократить объем индексов ColBERT на 50% без существенной потери производительности. Дальнейшее сокращение до 66-75% приводит к снижению эффективности менее чем на 5% для большинства наборов данных. Важно отметить, что этот метод не требует изменений в архитектуре модели и может быть легко интегрирован в процесс индексации.'}, 'en': {'title': 'Efficient Token Storage for Enhanced Retrieval Performance', 'desc': 'This paper presents a new method to improve multi-vector retrieval systems, particularly those based on ColBERT. The authors propose a clustering-based token pooling technique that significantly reduces the number of token-level vectors stored, addressing the high storage and memory demands of existing methods. Their approach can cut the storage requirements by 50% without losing retrieval accuracy, and even achieve reductions of 66% to 75% with minimal performance degradation. 
Importantly, this method is easy to implement, requiring no changes to the existing architecture or query processing, making it a practical enhancement for ColBERT-like models.'}, 'zh': {'title': '聚类池化:高效存储与检索的完美结合', 'desc': '近年来,多向量检索方法在神经信息检索中越来越受欢迎,尤其是ColBERT方法。该方法通过在标记级别存储表示,而不是在文档级别,展示了强大的检索性能,尤其是在域外设置中。然而,存储大量相关向量所需的存储和内存要求仍然是一个重要缺点,限制了其实际应用。本文提出了一种基于聚类的标记池化方法,可以大幅减少需要存储的向量数量,且几乎不影响检索性能。'}}}, {'id': 'https://huggingface.co/papers/2409.17580', 'title': 'Enhancing Structured-Data Retrieval with GraphRAG: Soccer Data Case Study', 'url': 'https://huggingface.co/papers/2409.17580', 'abstract': "Extracting meaningful insights from large and complex datasets poses significant challenges, particularly in ensuring the accuracy and relevance of retrieved information. Traditional data retrieval methods such as sequential search and index-based retrieval often fail when handling intricate and interconnected data structures, resulting in incomplete or misleading outputs. To overcome these limitations, we introduce Structured-GraphRAG, a versatile framework designed to enhance information retrieval across structured datasets in natural language queries. Structured-GraphRAG utilizes multiple knowledge graphs, which represent data in a structured format and capture complex relationships between entities, enabling a more nuanced and comprehensive retrieval of information. This graph-based approach reduces the risk of errors in language model outputs by grounding responses in a structured format, thereby enhancing the reliability of results. We demonstrate the effectiveness of Structured-GraphRAG by comparing its performance with that of a recently published method using traditional retrieval-augmented generation. Our findings show that Structured-GraphRAG significantly improves query processing efficiency and reduces response times. While our case study focuses on soccer data, the framework's design is broadly applicable, offering a powerful tool for data analysis and enhancing language model applications across various structured domains.", 'score': 7, 'issue_id': 1, 'pub_date': '2024-09-26', 'pub_date_card': {'ru': '26 сентября', 'en': 'September 26', 'zh': '9月26日'}, 'hash': 'c7496beca8061db3', 'authors': ['Zahra Sepasdar', 'Sushant Gautam', 'Cise Midoglu', 'Michael A. Riegler', 'Pål Halvorsen'], 'affiliations': ['Forzasys', 'OsloMet', 'SimulaMet'], 'pdf_title_img': 'assets\\pdf\\title_img\\2409.17580.jpg', 'data': {'categories': ['#reasoning', '#graphs', '#rag', '#data', '#interpretability', '#architecture'], 'emoji': '🕸️', 'ru': {'title': 'Графовый подход для точного извлечения данных', 'desc': 'Статья представляет Structured-GraphRAG - новый фреймворк для улучшения извлечения информации из сложных структурированных датасетов. Он использует множественные графы знаний для более точного и полного поиска данных. Structured-GraphRAG повышает надежность результатов языковых моделей, опираясь на структурированный формат. Эксперименты показали значительное улучшение эффективности обработки запросов по сравнению с традиционными методами.'}, 'en': {'title': 'Revolutionizing Data Retrieval with Structured-GraphRAG', 'desc': 'This paper presents Structured-GraphRAG, a new framework aimed at improving information retrieval from complex datasets using natural language queries. It addresses the shortcomings of traditional methods like sequential search by leveraging multiple knowledge graphs, which organize data and highlight relationships between entities. 
By grounding language model outputs in structured data, Structured-GraphRAG enhances the accuracy and relevance of the retrieved information. The framework has been shown to significantly boost query processing efficiency and is applicable to various domains beyond the soccer data case study.'}, 'zh': {'title': '提升结构化数据检索的效率与准确性', 'desc': '本论文介绍了一种名为Structured-GraphRAG的信息检索框架,旨在提高对结构化数据集的检索效率。传统的数据检索方法在处理复杂数据时常常无法提供准确的信息,导致结果不完整或误导。Structured-GraphRAG利用多个知识图谱,以结构化的方式表示数据,捕捉实体之间的复杂关系,从而实现更全面的信息检索。通过与传统的检索增强生成方法进行比较,我们的研究表明,Structured-GraphRAG在查询处理效率和响应时间上都有显著改善。'}}}, {'id': 'https://huggingface.co/papers/2409.18121', 'title': 'Robot See Robot Do: Imitating Articulated Object Manipulation with Monocular 4D Reconstruction', 'url': 'https://huggingface.co/papers/2409.18121', 'abstract': "Humans can learn to manipulate new objects by simply watching others; providing robots with the ability to learn from such demonstrations would enable a natural interface specifying new behaviors. This work develops Robot See Robot Do (RSRD), a method for imitating articulated object manipulation from a single monocular RGB human demonstration given a single static multi-view object scan. We first propose 4D Differentiable Part Models (4D-DPM), a method for recovering 3D part motion from a monocular video with differentiable rendering. This analysis-by-synthesis approach uses part-centric feature fields in an iterative optimization which enables the use of geometric regularizers to recover 3D motions from only a single video. Given this 4D reconstruction, the robot replicates object trajectories by planning bimanual arm motions that induce the demonstrated object part motion. By representing demonstrations as part-centric trajectories, RSRD focuses on replicating the demonstration's intended behavior while considering the robot's own morphological limits, rather than attempting to reproduce the hand's motion. We evaluate 4D-DPM's 3D tracking accuracy on ground truth annotated 3D part trajectories and RSRD's physical execution performance on 9 objects across 10 trials each on a bimanual YuMi robot. Each phase of RSRD achieves an average of 87% success rate, for a total end-to-end success rate of 60% across 90 trials. Notably, this is accomplished using only feature fields distilled from large pretrained vision models -- without any task-specific training, fine-tuning, dataset collection, or annotation. Project page: https://robot-see-robot-do.github.io", 'score': 7, 'issue_id': 1, 'pub_date': '2024-09-26', 'pub_date_card': {'ru': '26 сентября', 'en': 'September 26', 'zh': '9月26日'}, 'hash': '1397b774b882bc6c', 'authors': ['Justin Kerr', 'Chung Min Kim', 'Mingxuan Wu', 'Brent Yi', 'Qianqian Wang', 'Ken Goldberg', 'Angjoo Kanazawa'], 'affiliations': ['UC Berkeley'], 'pdf_title_img': 'assets\\pdf\\title_img\\2409.18121.jpg', 'data': {'categories': ['#cv', '#optimization', '#games', '#open_source', '#architecture', '#robotics', '#3d'], 'emoji': '🤖', 'ru': {'title': 'Роботы учатся манипулировать объектами, наблюдая за людьми', 'desc': 'Статья представляет метод Robot See Robot Do (RSRD) для имитации манипуляций с шарнирными объектами роботами на основе наблюдения за действиями человека. Авторы предлагают технику 4D Differentiable Part Models (4D-DPM) для восстановления трехмерного движения частей объекта из монокулярного видео с помощью дифференцируемого рендеринга. RSRD использует восстановленную 4D-реконструкцию для планирования движений робота, воспроизводящих траектории частей объекта. 
Метод достигает 60% успеха в физическом выполнении задач без специфического обучения или аннотаций данных.'}, 'en': {'title': 'Learning by Watching: Robots Imitate Human Object Manipulation', 'desc': 'This paper introduces Robot See Robot Do (RSRD), a method that allows robots to learn how to manipulate objects by observing human demonstrations. It utilizes 4D Differentiable Part Models (4D-DPM) to extract 3D motion information from a single monocular video, enabling the robot to understand and replicate the intended object movements. The approach focuses on part-centric trajectories, allowing the robot to plan its arm motions based on the demonstrated behavior while respecting its own physical capabilities. The method shows promising results, achieving an average success rate of 87% in tracking and 60% in execution across multiple trials without requiring specific training or data collection.'}, 'zh': {'title': '让机器人通过观察学习新技能', 'desc': '本研究提出了一种名为机器人看机器人做(RSRD)的方法,使机器人能够通过观察人类的单一演示来学习操控物体。我们首先引入了4D可微分部件模型(4D-DPM),该模型能够从单目视频中恢复3D部件运动。RSRD通过规划双手臂运动来复制物体轨迹,专注于再现演示的意图行为,而不是简单模仿手的动作。实验结果显示,RSRD在多个物体上的成功率达到87%,并且在没有特定任务训练的情况下实现了60%的整体成功率。'}}}];
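// Cached references to the article list container and the filter/sort controls.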
const articlesContainer = document.getElementById('articles-container');
const sortDropdown = document.getElementById('sort-dropdown');
const categoryFiltersContainer = document.getElementById('category-filters');
const categoryFiltersLogicOptions = document.getElementById('category-options');
const categoryToggle = document.getElementById('category-toggle');
const clearCategoriesButton = document.getElementById('clear-categories');
let selectedCategories = [];
let selectedArticles = [];
let sortBy = 'issue_id';
let showLimitHint = false;
let filterLogicIsAnd = false;
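// Read the 'cat' query parameter and return its comma-separated values as '#'-prefixed category tags.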
function getUrlParameters() {
const urlParams = new URLSearchParams(window.location.search);
const categoriesParam = urlParams.get('cat');
let categories = categoriesParam ? categoriesParam.split(',') : [];
categories = categories.map(element => `#${element}`);
return categories;
}
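// Mirror the current selection into the URL (?cat=...), dropping the leading '#'
// from each tag; with no selection, restore the bare path.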
function updateUrlWithCategories() {
let cleanedCategories = selectedCategories.map(element => element.replace(/^#/, ''));
const newUrl = cleanedCategories.length > 0
? `${window.location.pathname}?cat=${cleanedCategories.join(',')}`
: window.location.pathname;
console.log("cleanedCategories", cleanedCategories)
window.history.pushState({}, '', newUrl);
}
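// Restore persisted preferences (dark mode, sort order, AND/OR filter logic)
// from localStorage and sync the corresponding controls.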
function loadSettings() {
const themeToggle = document.getElementById('theme-toggle');
const sortDropdown = document.getElementById('sort-dropdown');
const isDarkMode = localStorage.getItem('darkMode') === 'true';
let settingSortBy = localStorage.getItem('sort_by');
filterLogicIsAnd = localStorage.getItem('filter_logic_is_and') === 'true';
if (isDarkMode) {
document.body.classList.remove('light-theme');
document.body.classList.add('dark-theme');
themeToggle.checked = true;
const title = document.getElementById('doomgrad');
title.innerHTML = "hf nightly";
const titleSign = document.getElementById('doomgrad-icon');
titleSign.classList.add('rotate');
}
if ((!settingSortBy) || (settingSortBy === 'null')) {
settingSortBy = 'issue_id';
}
if (filterLogicIsAnd) {
document.getElementById('filter-logic-and').checked = true;
} else {
document.getElementById('filter-logic-or').checked = true;
}
sortDropdown.value = settingSortBy;
sortBy = settingSortBy;
}
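// Wire up the theme toggle and the AND/OR filter-logic radio buttons; the chosen
// logic is persisted to localStorage and the article list is re-filtered immediately.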
document.getElementById('theme-toggle').addEventListener('change', toggleTheme);
document.getElementById('filter-logic-and').addEventListener('change', () => {
filterLogicIsAnd = true;
localStorage.setItem('filter_logic_is_and', 'true');
filterAndRenderArticles();
updateSelectedArticlesTitle();
});
document.getElementById('filter-logic-or').addEventListener('change', () => {
filterLogicIsAnd = false;
localStorage.setItem('filter_logic_is_and', 'false');
filterAndRenderArticles();
updateSelectedArticlesTitle();
});
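// Collect a sorted, de-duplicated list of all category tags present in the articles.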
function getUniqueCategories(articles) {
const categories = new Set();
articles.forEach(article => {
if (article.data && article.data.categories) {
article.data.categories.forEach(cat => categories.add(cat));
}
});
let res = Array.from(categories);
res.sort();
return res;
}
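// Render one clickable button per category. The list below is baked in at page
// generation time with per-category article counts; categories without a count
// (no matching articles today) are rendered as inactive.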
function createCategoryButtons() {
//const categories = getUniqueCategories(articlesData);
const categories = ['#3d (3)', '#agents', '#agi', '#alignment (2)', '#architecture (10)', '#audio (1)', '#benchmark (3)', '#cv (6)', '#data (3)', '#dataset (3)', '#diffusion (3)', '#ethics', '#games (2)', '#graphs (3)', '#hallucinations', '#healthcare', '#inference (4)', '#interpretability (3)', '#leakage', '#long_context (1)', '#low_resource', '#machine_translation', '#math', '#multilingual', '#multimodal (3)', '#open_source (4)', '#optimization (7)', '#plp', '#rag (2)', '#reasoning (2)', '#rl', '#rlhf', '#robotics (1)', '#science (1)', '#security', '#small_models', '#story_generation', '#survey (1)', '#synthetic (1)', '#training (6)', '#transfer_learning (3)', '#video'];
categories.forEach(category => {
let catNameSplitted = category.split(/(\s+)/);
let catName = catNameSplitted[0];
const button = document.createElement('span');
button.textContent = catName;
button.className = 'category-button';
if (catNameSplitted.length < 2) {
button.classList.add('inactive');
}
button.onclick = () => toggleCategory(catName, button);
categoryFiltersContainer.appendChild(button);
});
}
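// Toggle a category in the selection and its button state, then re-filter the list,
// persist the selection, and refresh the title, URL and filter-option visibility.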
function toggleCategory(category, button) {
const index = selectedCategories.indexOf(category);
if (index === -1) {
selectedCategories.push(category);
button.classList.add('active');
} else {
selectedCategories.splice(index, 1);
button.classList.remove('active');
}
filterAndRenderArticles();
saveCategorySelection();
updateSelectedArticlesTitle();
updateUrlWithCategories();
setFilterOptionsVisibility();
}
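// Persist the current category selection to localStorage as a JSON array
// (presumably restored elsewhere with JSON.parse).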
function saveCategorySelection() {
localStorage.setItem('selectedCategories', JSON.stringify(selectedCategories));
}
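// Show how many articles match the active filters in the filter-toggle label;
// when no filter is applied and every article is shown, display the plain label.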
function updateSelectedArticlesTitle() {
if ((selectedArticles.length === articlesData.length) && (selectedCategories.length === 0)) {
categoryToggle.textContent = `🏷️ ${filterLabel[currentLang]}`;
} else {
categoryToggle.textContent = `🏷️ ${filterLabel[currentLang]} (${formatArticlesTitle(selectedArticles.length, currentLang)})`;
}
}
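// Reset the stored category selection to an empty array.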
function cleanCategorySelection() {
localStorage.setItem('selectedCategories', JSON.stringify([]));
}