{
"date": {
"ru": "18 октября",
"en": "October 18",
"zh": "10月18日"
},
"time_utc": "2024-10-18 09:00",
"weekday": 4,
"issue_id": 176,
"home_page_url": "https://huggingface.co/papers?date=2024-10-18",
"papers": [
{
"id": "https://huggingface.co/papers/2410.13720",
"title": "Movie Gen: A Cast of Media Foundation Models",
"url": "https://huggingface.co/papers/2410.13720",
"abstract": "We present Movie Gen, a cast of foundation models that generates high-quality, 1080p HD videos with different aspect ratios and synchronized audio. We also show additional capabilities such as precise instruction-based video editing and generation of personalized videos based on a user's image. Our models set a new state-of-the-art on multiple tasks: text-to-video synthesis, video personalization, video editing, video-to-audio generation, and text-to-audio generation. Our largest video generation model is a 30B parameter transformer trained with a maximum context length of 73K video tokens, corresponding to a generated video of 16 seconds at 16 frames-per-second. We show multiple technical innovations and simplifications on the architecture, latent spaces, training objectives and recipes, data curation, evaluation protocols, parallelization techniques, and inference optimizations that allow us to reap the benefits of scaling pre-training data, model size, and training compute for training large scale media generation models. We hope this paper helps the research community to accelerate progress and innovation in media generation models. All videos from this paper are available at https://go.fb.me/MovieGenResearchVideos.",
"score": 88,
"issue_id": 147,
"pub_date": "2024-10-17",
"pub_date_card": {
"ru": "17 октября",
"en": "October 17",
"zh": "10月17日"
},
"hash": "086b8ff148ce7df3",
"authors": [
"Adam Polyak",
"Amit Zohar",
"Andrew Brown",
"Andros Tjandra",
"Animesh Sinha",
"Ann Lee",
"Apoorv Vyas",
"Bowen Shi",
"Chih-Yao Ma",
"Ching-Yao Chuang",
"David Yan",
"Dhruv Choudhary",
"Dingkang Wang",
"Geet Sethi",
"Guan Pang",
"Haoyu Ma",
"Ishan Misra",
"Ji Hou",
"Jialiang Wang",
"Kiran Jagadeesh",
"Kunpeng Li",
"Luxin Zhang",
"Mannat Singh",
"Mary Williamson",
"Matt Le",
"Matthew Yu",
"Mitesh Kumar Singh",
"Peizhao Zhang",
"Peter Vajda",
"Quentin Duval",
"Rohit Girdhar",
"Roshan Sumbaly",
"Sai Saketh Rambhatla",
"Sam Tsai",
"Samaneh Azadi",
"Samyak Datta",
"Sanyuan Chen",
"Sean Bell",
"Sharadh Ramaswamy",
"Shelly Sheynin",
"Siddharth Bhattacharya",
"Simran Motwani",
"Tao Xu",
"Tianhe Li",
"Tingbo Hou",
"Wei-Ning Hsu",
"Xi Yin",
"Xiaoliang Dai",
"Yaniv Taigman",
"Yaqiao Luo",
"Yen-Cheng Liu",
"Yi-Chiao Wu",
"Yue Zhao",
"Yuval Kirstain",
"Zecheng He",
"Zijian He",
"Albert Pumarola",
"Ali Thabet",
"Artsiom Sanakoyeu",
"Arun Mallya",
"Baishan Guo",
"Boris Araya",
"Breena Kerr",
"Carleigh Wood",
"Ce Liu",
"Cen Peng",
"Dimitry Vengertsev",
"Edgar Schonfeld",
"Elliot Blanchard",
"Felix Juefei-Xu",
"Fraylie Nord",
"Jeff Liang",
"John Hoffman",
"Jonas Kohler",
"Kaolin Fire",
"Karthik Sivakumar",
"Lawrence Chen",
"Licheng Yu",
"Luya Gao",
"Markos Georgopoulos",
"Rashel Moritz",
"Sara K. Sampson",
"Shikai Li",
"Simone Parmeggiani",
"Steve Fine",
"Tara Fowler",
"Vladan Petrovic",
"Yuming Du"
],
"affiliations": [
"Meta"
],
"pdf_title_img": "assets\\pdf\\title_img\\2410.13720.jpg",
"data": {
"categories": [
"#diffusion",
"#synthetic",
"#inference",
"#video",
"#optimization",
"#multimodal",
"#data",
"#training",
"#open_source",
"#audio",
"#architecture"
],
"emoji": "🎬",
"ru": {
"title": "MovieGen: Революция в генерации мультимедиа",
"desc": "MovieGen - это набор фундаментальных моделей, генерирующих высококачественные видео в формате 1080p HD с синхронизированным аудио. Модели устанавливают новый уровень качества в нескольких задачах, включая синтез видео по тексту, персонализацию видео и генерацию аудио. Крупнейшая модель имеет 30 миллиардов параметров и может генерировать 16-секундные видео. Авторы представляют ряд технических инноваций в архитектуре, обучении и оптимизации моделей генерации мультимедиа."
},
"en": {
"title": "Revolutionizing Video Creation with Movie Gen",
"desc": "Movie Gen introduces advanced foundation models capable of generating high-quality videos with synchronized audio, offering new capabilities in video editing and personalization. The models excel in tasks like text-to-video synthesis and video-to-audio generation, setting a new benchmark in the field. With a 30 billion parameter transformer, the system can produce 16-second videos at 16 frames per second, showcasing significant technical innovations. These advancements aim to push forward the research and development of large-scale media generation models."
},
"zh": {
"title": "Movie Gen:引领高清视频生成新标准",
"desc": "这篇论文介绍了一个名为Movie Gen的基础模型集,可以生成高质量的1080p高清视频,并支持不同的宽高比和同步音频。该模型还具备精确的指令视频编辑和基于用户图像生成个性化视频的能力。Movie Gen在多项任务上设立了新的技术标准,包括文本到视频合成、视频个性化、视频编辑、视频到音频生成和文本到音频生成。通过多项技术创新和简化,该模型在架构、潜在空间、训练目标、数据策划等方面取得了突破,推动了大规模媒体生成模型的进步。"
}
}
},
{
"id": "https://huggingface.co/papers/2410.13754",
"title": "MixEval-X: Any-to-Any Evaluations from Real-World Data Mixtures",
"url": "https://huggingface.co/papers/2410.13754",
"abstract": "Perceiving and generating diverse modalities are crucial for AI models to effectively learn from and engage with real-world signals, necessitating reliable evaluations for their development. We identify two major issues in current evaluations: (1) inconsistent standards, shaped by different communities with varying protocols and maturity levels; and (2) significant query, grading, and generalization biases. To address these, we introduce MixEval-X, the first any-to-any real-world benchmark designed to optimize and standardize evaluations across input and output modalities. We propose multi-modal benchmark mixture and adaptation-rectification pipelines to reconstruct real-world task distributions, ensuring evaluations generalize effectively to real-world use cases. Extensive meta-evaluations show our approach effectively aligns benchmark samples with real-world task distributions and the model rankings correlate strongly with that of crowd-sourced real-world evaluations (up to 0.98). We provide comprehensive leaderboards to rerank existing models and organizations and offer insights to enhance understanding of multi-modal evaluations and inform future research.",
"score": 74,
"issue_id": 148,
"pub_date": "2024-10-17",
"pub_date_card": {
"ru": "17 октября",
"en": "October 17",
"zh": "10月17日"
},
"hash": "82517ad6fbb54273",
"authors": [
"Jinjie Ni",
"Yifan Song",
"Deepanway Ghosal",
"Bo Li",
"David Junhao Zhang",
"Xiang Yue",
"Fuzhao Xue",
"Zian Zheng",
"Kaichen Zhang",
"Mahir Shah",
"Kabir Jain",
"Yang You",
"Michael Shieh"
],
"affiliations": [
"Carnegie Mellon University",
"Independent Researcher",
"Nanyang Technological University",
"National University of Singapore",
"Peking University",
"University of Waterloo"
],
"pdf_title_img": "assets\\pdf\\title_img\\2410.13754.jpg",
"data": {
"categories": [
"#benchmark",
"#optimization",
"#multimodal",
"#survey",
"#alignment"
],
"emoji": "🎭",
"ru": {
"title": "MixEval-X: Универсальный бенчмарк для оценки многомодальных моделей ИИ",
"desc": "Статья представляет MixEval-X - первый многомодальный бенчмарк для оценки моделей ИИ в реальных задачах. Авторы предлагают новые методы для создания репрезентативных наборов тестов, охватывающих различные модальности ввода и вывода. Бенчмарк решает проблемы несогласованности стандартов оценки и различных видов смещений в существующих методах. Результаты показывают высокую корреляцию с оценками краудсорсинга в реальных сценариях использования."
},
"en": {
"title": "MixEval-X: Bridging the Gap Between AI Benchmarks and Real-World Performance",
"desc": "The paper introduces MixEval-X, a benchmark designed to standardize evaluations across different input and output modalities in AI models. It addresses issues of inconsistent evaluation standards and biases in current methods by proposing a multi-modal benchmark mixture and adaptation-rectification pipelines. These pipelines help align benchmark samples with real-world task distributions, ensuring that evaluations are more representative of real-world scenarios. The approach shows strong correlation with real-world evaluations, providing valuable insights for improving multi-modal evaluations and guiding future research."
},
"zh": {
"title": "MixEval-X:优化多模态评估的全新基准",
"desc": "这篇论文讨论了AI模型在处理多种信号时需要可靠的评估方法。当前评估存在标准不一致和偏差问题。为此,作者提出了MixEval-X,一个用于优化和标准化多模态评估的基准。通过这种方法,评估结果更贴近真实世界的任务分布。"
}
}
},
{
"id": "https://huggingface.co/papers/2410.12784",
"title": "JudgeBench: A Benchmark for Evaluating LLM-based Judges",
"url": "https://huggingface.co/papers/2410.12784",
"abstract": "LLM-based judges have emerged as a scalable alternative to human evaluation and are increasingly used to assess, compare, and improve models. However, the reliability of LLM-based judges themselves is rarely scrutinized. As LLMs become more advanced, their responses grow more sophisticated, requiring stronger judges to evaluate them. Existing benchmarks primarily focus on a judge's alignment with human preferences, but often fail to account for more challenging tasks where crowdsourced human preference is a poor indicator of factual and logical correctness. To address this, we propose a novel evaluation framework to objectively evaluate LLM-based judges. Based on this framework, we propose JudgeBench, a benchmark for evaluating LLM-based judges on challenging response pairs spanning knowledge, reasoning, math, and coding. JudgeBench leverages a novel pipeline for converting existing difficult datasets into challenging response pairs with preference labels reflecting objective correctness. Our comprehensive evaluation on a collection of prompted judges, fine-tuned judges, multi-agent judges, and reward models shows that JudgeBench poses a significantly greater challenge than previous benchmarks, with many strong models (e.g., GPT-4o) performing just slightly better than random guessing. Overall, JudgeBench offers a reliable platform for assessing increasingly advanced LLM-based judges. Data and code are available at https://github.com/ScalerLab/JudgeBench .",
"score": 42,
"issue_id": 160,
"pub_date": "2024-10-16",
"pub_date_card": {
"ru": "16 октября",
"en": "October 16",
"zh": "10月16日"
},
"hash": "a81030e9f379736a",
"authors": [
"Sijun Tan",
"Siyuan Zhuang",
"Kyle Montgomery",
"William Y. Tang",
"Alejandro Cuadron",
"Chenguang Wang",
"Raluca Ada Popa",
"Ion Stoica"
],
"affiliations": [
"UC Berkeley",
"Washington University in St. Louis"
],
"pdf_title_img": "assets\\pdf\\title_img\\2410.12784.jpg",
"data": {
"categories": [
"#reasoning",
"#benchmark",
"#math",
"#plp",
"#data",
"#training",
"#dataset",
"#open_source",
"#architecture",
"#alignment"
],
"emoji": "⚖️",
"ru": {
"title": "JudgeBench: Новый стандарт для оценки ИИ-судей",
"desc": "Статья представляет новую систему оценки судей на основе больших языковых моделей (LLM). Авторы предлагают JudgeBench - набор тестов для оценки LLM-судей в сложных задачах, охватывающих знания, рассуждения, математику и программирование. JudgeBench использует новый подход к созданию сложных пар ответов с метками предпочтений, отражающими объективную правильность. Результаты показывают, что JudgeBench представляет значительно большую сложность, чем предыдущие тесты, с многими сильными моделями, работающими лишь немного лучше, чем случайное угадывание."
},
"en": {
"title": "JudgeBench: Raising the Bar for LLM Evaluation",
"desc": "The paper introduces JudgeBench, a new benchmark designed to evaluate the reliability of LLM-based judges, which are used to assess and improve machine learning models. Unlike existing benchmarks that focus on alignment with human preferences, JudgeBench emphasizes objective correctness in challenging tasks like reasoning and coding. The framework converts difficult datasets into response pairs with preference labels, providing a more rigorous test for LLM-based judges. Results show that even advanced models struggle with JudgeBench, highlighting its effectiveness in assessing the capabilities of these judges."
},
"zh": {
"title": "JudgeBench:评估大语言模型评判者的新基准",
"desc": "这篇论文提出了一种新的评估框架,用于客观地评估基于大语言模型(LLM)的评判者。研究者开发了一个名为JudgeBench的基准,用于评估这些评判者在知识、推理、数学和编程等方面的挑战性响应对。JudgeBench通过一个新颖的流程,将现有的困难数据集转换为具有偏好标签的挑战性响应对,以反映客观的正确性。研究结果表明,JudgeBench比以往的基准更具挑战性,许多强大的模型在此基准上的表现仅略优于随机猜测。"
}
}
},
{
"id": "https://huggingface.co/papers/2410.13863",
"title": "Fluid: Scaling Autoregressive Text-to-image Generative Models with Continuous Tokens",
"url": "https://huggingface.co/papers/2410.13863",
"abstract": "Scaling up autoregressive models in vision has not proven as beneficial as in large language models. In this work, we investigate this scaling problem in the context of text-to-image generation, focusing on two critical factors: whether models use discrete or continuous tokens, and whether tokens are generated in a random or fixed raster order using BERT- or GPT-like transformer architectures. Our empirical results show that, while all models scale effectively in terms of validation loss, their evaluation performance -- measured by FID, GenEval score, and visual quality -- follows different trends. Models based on continuous tokens achieve significantly better visual quality than those using discrete tokens. Furthermore, the generation order and attention mechanisms significantly affect the GenEval score: random-order models achieve notably better GenEval scores compared to raster-order models. Inspired by these findings, we train Fluid, a random-order autoregressive model on continuous tokens. Fluid 10.5B model achieves a new state-of-the-art zero-shot FID of 6.16 on MS-COCO 30K, and 0.69 overall score on the GenEval benchmark. We hope our findings and results will encourage future efforts to further bridge the scaling gap between vision and language models.",
"score": 35,
"issue_id": 160,
"pub_date": "2024-10-17",
"pub_date_card": {
"ru": "17 октября",
"en": "October 17",
"zh": "10月17日"
},
"hash": "3fa9a449112a391b",
"authors": [
"Lijie Fan",
"Tianhong Li",
"Siyang Qin",
"Yuanzhen Li",
"Chen Sun",
"Michael Rubinstein",
"Deqing Sun",
"Kaiming He",
"Yonglong Tian"
],
"affiliations": [
"Google DeepMind",
"MIT"
],
"pdf_title_img": "assets\\pdf\\title_img\\2410.13863.jpg",
"data": {
"categories": [
"#diffusion",
"#benchmark",
"#cv",
"#optimization",
"#games",
"#architecture"
],
"emoji": "🖼️",
"ru": {
"title": "Новый подход к масштабированию моделей генерации изображений",
"desc": "Исследователи изучили проблему масштабирования авторегрессионных моделей в контексте генерации изображений по тексту. Они сравнили модели с дискретными и непрерывными токенами, а также с различным порядком генерации и архитектурами трансформеров. Результаты показали, что модели с непрерывными токенами достигают лучшего визуального качества, а случайный порядок генерации улучшает показатели GenEval. На основе этих выводов была разработана модель Fluid, достигшая нового уровня производительности в нескольких бенчмарках."
},
"en": {
"title": "Bridging the Gap: Continuous Tokens and Random Order in Vision Models",
"desc": "This paper explores the challenges of scaling autoregressive models for text-to-image generation, focusing on the use of discrete versus continuous tokens and the order of token generation. The study finds that models using continuous tokens produce higher visual quality images compared to those using discrete tokens. Additionally, models that generate tokens in a random order outperform those using a fixed raster order in terms of GenEval scores. The authors introduce Fluid, a random-order autoregressive model with continuous tokens, which sets new benchmarks in zero-shot FID and GenEval scores, suggesting a promising direction for future research in bridging the gap between vision and language models."
},
"zh": {
"title": "突破视觉与语言模型扩展的界限",
"desc": "这篇论文研究了在图像生成中自回归模型的扩展问题,特别关注使用离散或连续的标记,以及标记生成的顺序。研究发现,使用连续标记的模型在视觉质量上明显优于使用离散标记的模型。生成顺序和注意力机制对GenEval评分有显著影响,随机顺序的模型在GenEval评分上表现更好。基于这些发现,作者训练了Fluid模型,在MS-COCO 30K数据集上取得了新的零样本FID记录。"
}
}
},
{
"id": "https://huggingface.co/papers/2410.13268",
"title": "Roadmap towards Superhuman Speech Understanding using Large Language Models",
"url": "https://huggingface.co/papers/2410.13268",
"abstract": "The success of large language models (LLMs) has prompted efforts to integrate speech and audio data, aiming to create general foundation models capable of processing both textual and non-textual inputs. Recent advances, such as GPT-4o, highlight the potential for end-to-end speech LLMs, which preserves non-semantic information and world knowledge for deeper speech understanding. To guide the development of speech LLMs, we propose a five-level roadmap, ranging from basic automatic speech recognition (ASR) to advanced superhuman models capable of integrating non-semantic information with abstract acoustic knowledge for complex tasks. Moreover, we design a benchmark, SAGI Bechmark, that standardizes critical aspects across various tasks in these five levels, uncovering challenges in using abstract acoustic knowledge and completeness of capability. Our findings reveal gaps in handling paralinguistic cues and abstract acoustic knowledge, and we offer future directions. This paper outlines a roadmap for advancing speech LLMs, introduces a benchmark for evaluation, and provides key insights into their current limitations and potential.",
"score": 33,
"issue_id": 153,
"pub_date": "2024-10-17",
"pub_date_card": {
"ru": "17 октября",
"en": "October 17",
"zh": "10月17日"
},
"hash": "929ec80dcb105705",
"authors": [
"Fan Bu",
"Yuhao Zhang",
"Xidong Wang",
"Benyou Wang",
"Qun Liu",
"Haizhou Li"
],
"affiliations": [
"Noahs Ark Lab, Huawei",
"The Chinese University of Hong Kong, Shenzhen"
],
"pdf_title_img": "assets\\pdf\\title_img\\2410.13268.jpg",
"data": {
"categories": [
"#science",
"#benchmark",
"#agi",
"#multimodal",
"#survey",
"#audio",
"#architecture"
],
"emoji": "🗣️",
"ru": {
"title": "Дорожная карта для речевых LLM: от распознавания речи к сверхчеловеческим моделям",
"desc": "В статье предлагается дорожная карта из пяти уровней для развития речевых языковых моделей (LLM), способных обрабатывать как текстовые, так и нетекстовые входные данные. Авторы разработали бенчмарк SAGI для стандартизации оценки различных задач на этих уровнях. Исследование выявило пробелы в обработке паралингвистических сигналов и абстрактных акустических знаний. Статья предлагает направления для будущих исследований в области речевых LLM."
},
"en": {
"title": "Bridging Text and Sound: The Future of Speech Language Models",
"desc": "This paper explores the integration of speech and audio data into large language models (LLMs) to create versatile models that can handle both text and non-text inputs. It introduces a five-level roadmap for developing speech LLMs, from basic automatic speech recognition (ASR) to advanced models that incorporate non-semantic information and abstract acoustic knowledge. The authors also present the SAGI Benchmark, which evaluates these models across various tasks and highlights challenges in processing paralinguistic cues and abstract acoustic knowledge. The paper provides insights into the current limitations of speech LLMs and suggests future research directions to enhance their capabilities."
},
"zh": {
"title": "语音大模型的未来:从基础到超人",
"desc": "这篇论文探讨了将语音和音频数据整合到大型语言模型中的可能性,旨在创建能够处理文本和非文本输入的通用基础模型。研究提出了一个五级路线图,从基本的自动语音识别到能够处理复杂任务的超人模型。作者还设计了一个名为SAGI的基准,用于标准化这些五个级别中各种任务的关键方面。研究发现当前模型在处理副语言线索和抽象声学知识方面存在不足,并提供了未来的发展方向。"
}
}
},
{
"id": "https://huggingface.co/papers/2410.13757",
"title": "MobA: A Two-Level Agent System for Efficient Mobile Task Automation",
"url": "https://huggingface.co/papers/2410.13757",
"abstract": "Current mobile assistants are limited by dependence on system APIs or struggle with complex user instructions and diverse interfaces due to restricted comprehension and decision-making abilities. To address these challenges, we propose MobA, a novel Mobile phone Agent powered by multimodal large language models that enhances comprehension and planning capabilities through a sophisticated two-level agent architecture. The high-level Global Agent (GA) is responsible for understanding user commands, tracking history memories, and planning tasks. The low-level Local Agent (LA) predicts detailed actions in the form of function calls, guided by sub-tasks and memory from the GA. Integrating a Reflection Module allows for efficient task completion and enables the system to handle previously unseen complex tasks. MobA demonstrates significant improvements in task execution efficiency and completion rate in real-life evaluations, underscoring the potential of MLLM-empowered mobile assistants.",
"score": 31,
"issue_id": 150,
"pub_date": "2024-10-17",
"pub_date_card": {
"ru": "17 октября",
"en": "October 17",
"zh": "10月17日"
},
"hash": "a4a73fb090d1a0ae",
"authors": [
"Zichen Zhu",
"Hao Tang",
"Yansi Li",
"Kunyao Lan",
"Yixuan Jiang",
"Hao Zhou",
"Yixiao Wang",
"Situo Zhang",
"Liangtai Sun",
"Lu Chen",
"Kai Yu"
],
"affiliations": [
"Shanghai Jiao Tong University, China"
],
"pdf_title_img": "assets\\pdf\\title_img\\2410.13757.jpg",
"data": {
"categories": [
"#reasoning",
"#agi",
"#multimodal",
"#agents",
"#architecture",
"#alignment"
],
"emoji": "📱",
"ru": {
"title": "MobA: Умный мобильный помощник нового поколения",
"desc": "MobA - это новый мобильный агент, основанный на мультимодальных больших языковых моделях. Он использует двухуровневую архитектуру с глобальным агентом для понимания команд и планирования, и локальным агентом для выполнения конкретных действий. Система включает модуль рефлексии для эффективного выполнения задач и обработки новых сложных заданий. MobA показывает значительное улучшение эффективности и уровня выполнения задач в реальных условиях."
},
"en": {
"title": "Revolutionizing Mobile Assistance with Multimodal Intelligence",
"desc": "The paper introduces MobA, a mobile assistant that uses multimodal large language models to improve understanding and task planning. It features a two-level agent architecture with a Global Agent for command comprehension and task planning, and a Local Agent for executing detailed actions. A Reflection Module is integrated to enhance the system's ability to handle complex and novel tasks. Real-life tests show MobA's improved efficiency and success in completing tasks, highlighting the potential of advanced language models in mobile assistants."
},
"zh": {
"title": "多模态大语言模型助力移动助手新突破",
"desc": "当前的移动助手由于对系统API的依赖和对复杂用户指令的理解能力有限,难以处理多样化的界面。为了解决这些问题,我们提出了MobA,这是一种由多模态大语言模型驱动的移动代理,通过复杂的双层代理架构增强理解和规划能力。高层的全局代理负责理解用户命令、跟踪历史记忆和规划任务,而低层的本地代理则根据子任务和全局代理的记忆预测详细的动作。通过集成反思模块,系统能够高效完成任务,并处理以前未见过的复杂任务。"
}
}
},
{
"id": "https://huggingface.co/papers/2410.12705",
"title": "WorldCuisines: A Massive-Scale Benchmark for Multilingual and Multicultural Visual Question Answering on Global Cuisines",
"url": "https://huggingface.co/papers/2410.12705",
"abstract": "Vision Language Models (VLMs) often struggle with culture-specific knowledge, particularly in languages other than English and in underrepresented cultural contexts. To evaluate their understanding of such knowledge, we introduce WorldCuisines, a massive-scale benchmark for multilingual and multicultural, visually grounded language understanding. This benchmark includes a visual question answering (VQA) dataset with text-image pairs across 30 languages and dialects, spanning 9 language families and featuring over 1 million data points, making it the largest multicultural VQA benchmark to date. It includes tasks for identifying dish names and their origins. We provide evaluation datasets in two sizes (12k and 60k instances) alongside a training dataset (1 million instances). Our findings show that while VLMs perform better with correct location context, they struggle with adversarial contexts and predicting specific regional cuisines and languages. To support future research, we release a knowledge base with annotated food entries and images along with the VQA data.",
"score": 29,
"issue_id": 157,
"pub_date": "2024-10-16",
"pub_date_card": {
"ru": "16 октября",
"en": "October 16",
"zh": "10月16日"
},
"hash": "6829d8490ef2d294",
"authors": [
"Genta Indra Winata",
"Frederikus Hudi",
"Patrick Amadeus Irawan",
"David Anugraha",
"Rifki Afina Putri",
"Yutong Wang",
"Adam Nohejl",
"Ubaidillah Ariq Prathama",
"Nedjma Ousidhoum",
"Afifa Amriani",
"Anar Rzayev",
"Anirban Das",
"Ashmari Pramodya",
"Aulia Adila",
"Bryan Wilie",
"Candy Olivia Mawalim",
"Ching Lam Cheng",
"Daud Abolade",
"Emmanuele Chersoni",
"Enrico Santus",
"Fariz Ikhwantri",
"Garry Kuwanto",
"Hanyang Zhao",
"Haryo Akbarianto Wibowo",
"Holy Lovenia",
"Jan Christian Blaise Cruz",
"Jan Wira Gotama Putra",
"Junho Myung",
"Lucky Susanto",
"Maria Angelica Riera Machin",
"Marina Zhukova",
"Michael Anugraha",
"Muhammad Farid Adilazuarda",
"Natasha Santosa",
"Peerat Limkonchotiwat",
"Raj Dabre",
"Rio Alexander Audino",
"Samuel Cahyawijaya",
"Shi-Xiong Zhang",
"Stephanie Yulia Salim",
"Yi Zhou",
"Yinxuan Gui",
"David Ifeoluwa Adelani",
"En-Shiun Annie Lee",
"Shogo Okada",
"Ayu Purwarianti",
"Alham Fikri Aji",
"Taro Watanabe",
"Derry Tanti Wijaya",
"Alice Oh",
"Chong-Wah Ngo"
],
"affiliations": [
"AI Singapore",
"Boston University",
"Capital One",
"Cardiff University",
"Cohere",
"Columbia University",
"HK PolyU",
"HKUST",
"ITB",
"Independent",
"JAIST",
"KAIST",
"MBZUAI",
"MILA",
"Masakhane",
"McGill",
"Monash University",
"NAIST",
"NICT",
"Ontario Tech",
"SEACrowd",
"SMU",
"Tokyo Tech",
"UCSB",
"University of Lagos",
"UofT"
],
"pdf_title_img": "assets\\pdf\\title_img\\2410.12705.jpg",
"data": {
"categories": [
"#benchmark",
"#multilingual",
"#cv",
"#graphs",
"#multimodal",
"#dataset",
"#open_source",
"#games",
"#low_resource"
],
"emoji": "🌎",
"ru": {
"title": "WorldCuisines: Глобальный тест на кулинарную эрудицию для ИИ",
"desc": "Статья представляет новый бенчмарк WorldCuisines для оценки понимания культурно-специфических знаний моделями компьютерного зрения и обработки естественного языка. Бенчмарк включает набор данных для визуального ответа на вопросы (VQA) с парами текст-изображение на 30 языках и диалектах, охватывающих 9 языковых семей. Авторы предоставляют наборы данных для оценки в двух размерах, а также тренировочный набор из 1 миллиона примеров. Результаты показывают, что модели лучше справляются с правильным контекстом местоположения, но испытывают трудности с состязательными контекстами и прогнозированием конкретных региональных кухонь и языков."
},
"en": {
"title": "\"WorldCuisines: Bridging Cultural Gaps in Vision Language Models\"",
"desc": "The paper introduces WorldCuisines, a large-scale benchmark designed to test Vision Language Models (VLMs) on their ability to understand culture-specific knowledge across multiple languages and dialects. This benchmark includes a Visual Question Answering (VQA) dataset with over 1 million text-image pairs, making it the largest of its kind for multicultural contexts. The study reveals that while VLMs can perform well when given correct location context, they face challenges with adversarial contexts and accurately predicting regional cuisines and languages. To aid further research, the authors provide a comprehensive knowledge base with annotated food entries and images."
},
"zh": {
"title": "跨文化视觉语言理解的新基准",
"desc": "这篇论文介绍了一个名为WorldCuisines的大规模基准,用于评估视觉语言模型在多语言和多文化背景下的理解能力。该基准包括一个视觉问答数据集,涵盖30种语言和方言,涉及9个语言家族,拥有超过100万个数据点。研究发现,视觉语言模型在正确的地理背景下表现较好,但在对抗性背景和预测特定地区的菜肴和语言时表现较差。为了支持未来的研究,作者还发布了一个包含注释食品条目和图像的知识库。"
}
}
},
{
"id": "https://huggingface.co/papers/2410.13824",
"title": "Harnessing Webpage UIs for Text-Rich Visual Understanding",
"url": "https://huggingface.co/papers/2410.13824",
"abstract": "Text-rich visual understanding-the ability to process environments where dense textual content is integrated with visuals-is crucial for multimodal large language models (MLLMs) to interact effectively with structured environments. To enhance this capability, we propose synthesizing general multimodal instructions from webpage UIs using text-based large language models (LLMs). Despite lacking direct visual input, text-based LLMs are able to process structured text representations from webpage accessibility trees. These instructions are then paired with UI screenshots to train multimodal models. We introduce MultiUI, a dataset containing 7.3 million samples from 1 million websites, covering diverse multimodal tasks and UI layouts. Models trained on MultiUI not only excel in web UI tasks-achieving up to a 48\\% improvement on VisualWebBench and a 19.1\\% boost in action accuracy on a web agent dataset Mind2Web-but also generalize surprisingly well to non-web UI tasks and even to non-UI domains, such as document understanding, OCR, and chart interpretation. These results highlight the broad applicability of web UI data for advancing text-rich visual understanding across various scenarios.",
"score": 29,
"issue_id": 146,
"pub_date": "2024-10-17",
"pub_date_card": {
"ru": "17 октября",
"en": "October 17",
"zh": "10月17日"
},
"hash": "7d1ade016ff53a03",
"authors": [
"Junpeng Liu",
"Tianyue Ou",
"Yifan Song",
"Yuxiao Qu",
"Wai Lam",
"Chenyan Xiong",
"Wenhu Chen",
"Graham Neubig",
"Xiang Yue"
],
"affiliations": [
"Carnegie Mellon University",
"Peking University",
"The Chinese University of Hong Kong",
"University of Waterloo"
],
"pdf_title_img": "assets\\pdf\\title_img\\2410.13824.jpg",
"data": {
"categories": [
"#science",
"#synthetic",
"#benchmark",
"#cv",
"#graphs",
"#optimization",
"#multimodal",
"#data",
"#training",
"#dataset",
"#transfer_learning",
"#games",
"#architecture"
],
"emoji": "🌐",
"ru": {
"title": "Синтез веб-данных для универсального мультимодального понимания",
"desc": "В статье представлен новый подход к улучшению понимания визуального контекста с текстом для мультимодальных больших языковых моделей. Авторы предлагают синтезировать инструкции из веб-интерфейсов с помощью текстовых языковых моделей. Создан датасет MultiUI, содержащий 7,3 миллиона образцов из 1 миллиона веб-сайтов. Модели, обученные на MultiUI, показывают значительное улучшение в задачах веб-интерфейсов и обобщают свои способности на другие домены."
},
"en": {
"title": "Unlocking Text-Rich Visual Understanding with Web UI Data",
"desc": "The paper introduces a method to improve multimodal large language models (MLLMs) by synthesizing instructions from webpage UIs using text-based large language models (LLMs). These models, despite not having direct visual input, can process structured text from webpage accessibility trees and are trained with UI screenshots. The authors present MultiUI, a dataset with 7.3 million samples from 1 million websites, which helps models excel in web UI tasks and generalize to other domains like document understanding and OCR. The study demonstrates that web UI data can significantly enhance text-rich visual understanding across various applications."
},
"zh": {
"title": "网页UI数据:提升多模态视觉理解的关键",
"desc": "这篇论文提出了一种方法,通过网页的可访问性树生成多模态指令,来增强多模态大语言模型的文本丰富视觉理解能力。研究中使用了一个名为MultiUI的数据集,包含了来自100万个网站的730万样本,用于训练多模态模型。实验结果表明,使用MultiUI训练的模型在网页UI任务中表现优异,并且在非网页UI任务和其他领域如文档理解、OCR和图表解释中也有良好的泛化能力。这表明网页UI数据在提升多种场景下的文本丰富视觉理解方面具有广泛的应用潜力。"
}
}
},
{
"id": "https://huggingface.co/papers/2410.13848",
"title": "Janus: Decoupling Visual Encoding for Unified Multimodal Understanding and Generation",
"url": "https://huggingface.co/papers/2410.13848",
"abstract": "In this paper, we introduce Janus, an autoregressive framework that unifies multimodal understanding and generation. Prior research often relies on a single visual encoder for both tasks, such as Chameleon. However, due to the differing levels of information granularity required by multimodal understanding and generation, this approach can lead to suboptimal performance, particularly in multimodal understanding. To address this issue, we decouple visual encoding into separate pathways, while still leveraging a single, unified transformer architecture for processing. The decoupling not only alleviates the conflict between the visual encoder's roles in understanding and generation, but also enhances the framework's flexibility. For instance, both the multimodal understanding and generation components can independently select their most suitable encoding methods. Experiments show that Janus surpasses previous unified model and matches or exceeds the performance of task-specific models. The simplicity, high flexibility, and effectiveness of Janus make it a strong candidate for next-generation unified multimodal models.",
"score": 28,
"issue_id": 148,
"pub_date": "2024-10-17",
"pub_date_card": {
"ru": "17 октября",
"en": "October 17",
"zh": "10月17日"
},
"hash": "8b28045f373976ba",
"authors": [
"Chengyue Wu",
"Xiaokang Chen",
"Zhiyu Wu",
"Yiyang Ma",
"Xingchao Liu",
"Zizheng Pan",
"Wen Liu",
"Zhenda Xie",
"Xingkai Yu",
"Chong Ruan",
"Ping Luo"
],
"affiliations": [
"DeepSeek-AI",
"Peking University",
"The University of Hong Kong"
],
"pdf_title_img": "assets\\pdf\\title_img\\2410.13848.jpg",
"data": {
"categories": [
"#optimization",
"#multimodal",
"#interpretability",
"#games",
"#architecture"
],
"emoji": "🔀",
"ru": {
"title": "Janus: единая модель для мультимодального понимания и генерации",
"desc": "Статья представляет Janus - новую авторегрессивную модель для мультимодального понимания и генерации. В отличие от предыдущих подходов, Janus использует отдельные визуальные энкодеры для задач понимания и генерации, что позволяет оптимизировать работу модели. Единая архитектура трансформера обрабатывает данные от обоих энкодеров. Эксперименты показывают, что Janus превосходит предыдущие унифицированные модели и не уступает специализированным моделям для конкретных задач."
},
"en": {
"title": "Janus: A New Era in Multimodal Intelligence",
"desc": "The paper introduces Janus, a new framework that improves how machines understand and create content using different types of data, like images and text. Unlike previous models that used one visual encoder for both understanding and generating, Janus separates these tasks into different pathways, which helps improve performance. By using a single transformer architecture, Janus allows each task to choose the best way to process information, making it more flexible and effective. Experiments show that Janus not only outperforms previous models but also competes well with models designed for specific tasks."
},
"zh": {
"title": "Janus:多模态理解与生成的全新统一框架",
"desc": "这篇论文介绍了Janus,一个统一多模态理解和生成的自回归框架。以往的研究通常使用单一的视觉编码器来处理这两项任务,但由于多模态理解和生成所需的信息粒度不同,这种方法可能导致性能不佳。为了解决这个问题,Janus将视觉编码解耦为独立的路径,同时仍然使用统一的Transformer架构进行处理。实验表明,Janus不仅超越了之前的统一模型,还能匹敌或超过特定任务模型的表现。"
}
}
},
{
"id": "https://huggingface.co/papers/2410.13830",
"title": "DreamVideo-2: Zero-Shot Subject-Driven Video Customization with Precise Motion Control",
"url": "https://huggingface.co/papers/2410.13830",
"abstract": "Recent advances in customized video generation have enabled users to create videos tailored to both specific subjects and motion trajectories. However, existing methods often require complicated test-time fine-tuning and struggle with balancing subject learning and motion control, limiting their real-world applications. In this paper, we present DreamVideo-2, a zero-shot video customization framework capable of generating videos with a specific subject and motion trajectory, guided by a single image and a bounding box sequence, respectively, and without the need for test-time fine-tuning. Specifically, we introduce reference attention, which leverages the model's inherent capabilities for subject learning, and devise a mask-guided motion module to achieve precise motion control by fully utilizing the robust motion signal of box masks derived from bounding boxes. While these two components achieve their intended functions, we empirically observe that motion control tends to dominate over subject learning. To address this, we propose two key designs: 1) the masked reference attention, which integrates a blended latent mask modeling scheme into reference attention to enhance subject representations at the desired positions, and 2) a reweighted diffusion loss, which differentiates the contributions of regions inside and outside the bounding boxes to ensure a balance between subject and motion control. Extensive experimental results on a newly curated dataset demonstrate that DreamVideo-2 outperforms state-of-the-art methods in both subject customization and motion control. The dataset, code, and models will be made publicly available.",
"score": 23,
"issue_id": 146,
"pub_date": "2024-10-17",
"pub_date_card": {
"ru": "17 октября",
"en": "October 17",
"zh": "10月17日"
},
"hash": "67dc892195cd59d6",
"authors": [
"Yujie Wei",
"Shiwei Zhang",
"Hangjie Yuan",
"Xiang Wang",
"Haonan Qiu",
"Rui Zhao",
"Yutong Feng",
"Feng Liu",
"Zhizhong Huang",
"Jiaxin Ye",
"Yingya Zhang",
"Hongming Shan"
],
"affiliations": [
"Alibaba Group",
"Fudan University",
"Michigan State University",
"Nanyang Technological University"
],
"pdf_title_img": "assets\\pdf\\title_img\\2410.13830.jpg",
"data": {
"categories": [
"#diffusion",
"#video",
"#training",
"#dataset",
"#open_source",
"#games",
"#architecture"
],
"emoji": "🎬",
"ru": {
"title": "Создание персонализированных видео одним щелчком",
"desc": "DreamVideo-2 - это новая система для создания персонализированных видео без дополнительного обучения. Она использует одно изображение и последовательность ограничивающих рамок для генерации видео с заданным объектом и траекторией движения. Система вводит референсное внимание и маскированный модуль движения для балансировки между сохранением объекта и контролем движения. Эксперименты показывают превосходство DreamVideo-2 над современными методами в области персонализации объектов и контроля движения."
},
"en": {
"title": "Effortless Video Customization with DreamVideo-2",
"desc": "DreamVideo-2 is a new framework for creating customized videos without needing complex adjustments during testing. It uses a single image and a sequence of bounding boxes to guide video generation, focusing on both the subject and its motion. The framework introduces reference attention and a mask-guided motion module to improve subject learning and motion control. To balance these aspects, it employs masked reference attention and reweighted diffusion loss, achieving superior results compared to existing methods."
},
"zh": {
"title": "DreamVideo-2:无微调的视频定制新突破",
"desc": "这篇论文介绍了一种名为DreamVideo-2的视频定制框架,可以在不需要测试时微调的情况下生成特定主题和运动轨迹的视频。该方法通过引入参考注意力和掩码引导运动模块,实现了对主题学习和运动控制的平衡。研究发现,运动控制往往会压倒主题学习,因此提出了掩码参考注意力和重加权扩散损失来解决这一问题。实验结果表明,DreamVideo-2在主题定制和运动控制方面优于现有方法。"
}
}
},
{
"id": "https://huggingface.co/papers/2410.11842",
"title": "MoH: Multi-Head Attention as Mixture-of-Head Attention",
"url": "https://huggingface.co/papers/2410.11842",
"abstract": "In this work, we upgrade the multi-head attention mechanism, the core of the Transformer model, to improve efficiency while maintaining or surpassing the previous accuracy level. We show that multi-head attention can be expressed in the summation form. Drawing on the insight that not all attention heads hold equal significance, we propose Mixture-of-Head attention (MoH), a new architecture that treats attention heads as experts in the Mixture-of-Experts (MoE) mechanism. MoH has two significant advantages: First, MoH enables each token to select the appropriate attention heads, enhancing inference efficiency without compromising accuracy or increasing the number of parameters. Second, MoH replaces the standard summation in multi-head attention with a weighted summation, introducing flexibility to the attention mechanism and unlocking extra performance potential. Extensive experiments on ViT, DiT, and LLMs demonstrate that MoH outperforms multi-head attention by using only 50%-90% of the attention heads. Moreover, we demonstrate that pre-trained multi-head attention models, such as LLaMA3-8B, can be further continue-tuned into our MoH models. Notably, MoH-LLaMA3-8B achieves an average accuracy of 64.0% across 14 benchmarks, outperforming LLaMA3-8B by 2.4% by utilizing only 75% of the attention heads. We believe the proposed MoH is a promising alternative to multi-head attention and provides a strong foundation for developing advanced and efficient attention-based models.",
"score": 20,
"issue_id": 148,
"pub_date": "2024-10-15",
"pub_date_card": {
"ru": "15 октября",
"en": "October 15",
"zh": "10月15日"
},
"hash": "4a94e557d3f7a79a",
"authors": [
"Peng Jin",
"Bo Zhu",
"Li Yuan",
"Shuicheng Yan"
],
"affiliations": [
"Kunlun 2050 Research & Skywork AI, Singapore",
"Peng Cheng Laboratory, Shenzhen, China",
"Rabbitpre Intelligence, Shenzhen, China",
"School of Electronic and Computer Engineering, Peking University, Shenzhen, China"
],
"pdf_title_img": "assets\\pdf\\title_img\\2410.11842.jpg",
"data": {
"categories": [
"#small_models",
"#inference",
"#optimization",
"#training",
"#transfer_learning",
"#architecture"
],
"emoji": "🧠",
"ru": {
"title": "Смешивание голов внимания для повышения эффективности трансформеров",
"desc": "Исследователи предлагают новый механизм внимания Mixture-of-Head (MoH), который улучшает эффективность многоголового внимания в трансформерах. MoH позволяет каждому токену выбирать подходящие головы внимания, повышая эффективность вывода без ущерба для точности. Эксперименты на ViT, DiT и языковых моделях показывают, что MoH превосходит стандартное многоголовое внимание, используя лишь 50-90% голов. Продолжительная настройка предобученных моделей, таких как LLaMA3-8B, с использованием MoH также демонстрирует значительное улучшение производительности."
},
"en": {
"title": "\"MoH: Elevating Attention Efficiency and Accuracy\"",
"desc": "This paper introduces Mixture-of-Head attention (MoH), an enhancement to the multi-head attention mechanism in Transformer models, aimed at improving efficiency and accuracy. MoH treats attention heads as experts, allowing each token to select the most relevant heads, which boosts inference efficiency without increasing parameters. By replacing the standard summation with a weighted summation, MoH adds flexibility and unlocks additional performance potential. Experiments show that MoH outperforms traditional multi-head attention, achieving higher accuracy with fewer attention heads, and can be applied to pre-trained models like LLaMA3-8B for further improvements."
},
"zh": {
"title": "混合头注意力:高效的Transformer新选择",
"desc": "这项研究改进了Transformer模型中的多头注意力机制,提高了效率,同时保持或超过了之前的准确性。研究表明,多头注意力可以用求和形式表示,并提出了混合头注意力(MoH)架构,将注意力头视为专家。MoH允许每个标记选择合适的注意力头,提高推理效率而不增加参数数量。实验表明,MoH在使用较少注意力头的情况下,性能优于传统多头注意力。"
}
}
},
{
"id": "https://huggingface.co/papers/2410.13085",
"title": "MMed-RAG: Versatile Multimodal RAG System for Medical Vision Language Models",
"url": "https://huggingface.co/papers/2410.13085",
"abstract": "Artificial Intelligence (AI) has demonstrated significant potential in healthcare, particularly in disease diagnosis and treatment planning. Recent progress in Medical Large Vision-Language Models (Med-LVLMs) has opened up new possibilities for interactive diagnostic tools. However, these models often suffer from factual hallucination, which can lead to incorrect diagnoses. Fine-tuning and retrieval-augmented generation (RAG) have emerged as methods to address these issues. However, the amount of high-quality data and distribution shifts between training data and deployment data limit the application of fine-tuning methods. Although RAG is lightweight and effective, existing RAG-based approaches are not sufficiently general to different medical domains and can potentially cause misalignment issues, both between modalities and between the model and the ground truth. In this paper, we propose a versatile multimodal RAG system, MMed-RAG, designed to enhance the factuality of Med-LVLMs. Our approach introduces a domain-aware retrieval mechanism, an adaptive retrieved contexts selection method, and a provable RAG-based preference fine-tuning strategy. These innovations make the RAG process sufficiently general and reliable, significantly improving alignment when introducing retrieved contexts. Experimental results across five medical datasets (involving radiology, ophthalmology, pathology) on medical VQA and report generation demonstrate that MMed-RAG can achieve an average improvement of 43.8% in the factual accuracy of Med-LVLMs. Our data and code are available in https://github.com/richard-peng-xia/MMed-RAG.",
"score": 20,
"issue_id": 146,
"pub_date": "2024-10-16",
"pub_date_card": {
"ru": "16 октября",
"en": "October 16",
"zh": "10月16日"
},
"hash": "8ef96c4ea4d54ffd",
"authors": [
"Peng Xia",
"Kangyu Zhu",
"Haoran Li",
"Tianze Wang",
"Weijia Shi",
"Sheng Wang",
"Linjun Zhang",
"James Zou",
"Huaxiu Yao"
],
"affiliations": [
"Brown University",
"PloyU",
"Rutgers University",
"Stanford University",
"UNC-Chapel Hill",
"University of Washington"
],
"pdf_title_img": "assets\\pdf\\title_img\\2410.13085.jpg",
"data": {
"categories": [
"#rag",
"#hallucinations",
"#benchmark",
"#cv",
"#multimodal",
"#healthcare",
"#data",
"#training",
"#dataset",
"#open_source",
"#alignment"
],
"emoji": "🏥",
"ru": {
"title": "MMed-RAG: Повышение точности медицинских AI-диагнозов",
"desc": "В статье представлена система MMed-RAG, разработанная для повышения фактической точности медицинских крупномасштабных моделей зрения и языка (Med-LVLMs). Система использует механизм поиска с учетом домена, адаптивный метод выбора найденных контекстов и стратегию дообучения на основе RAG. Эксперименты на пяти медицинских наборах данных показали среднее улучшение фактической точности Med-LVLMs на 43.8%. MMed-RAG решает проблемы галлюцинаций и несоответствий, часто встречающихся в существующих моделях."
},
"en": {
"title": "Enhancing Medical AI: MMed-RAG Boosts Diagnostic Accuracy",
"desc": "The paper discusses the development of MMed-RAG, a new system designed to improve the accuracy of Medical Large Vision-Language Models (Med-LVLMs) by addressing issues of factual hallucination. MMed-RAG uses a domain-aware retrieval mechanism and an adaptive context selection method to enhance the reliability of retrieval-augmented generation (RAG) processes. This approach ensures better alignment between the model's outputs and the ground truth across various medical domains. Experimental results show that MMed-RAG significantly boosts the factual accuracy of Med-LVLMs by 43.8% on average across multiple medical datasets."
},
"zh": {
"title": "MMed-RAG:提升医学AI模型准确性的多模态解决方案",
"desc": "这篇论文介绍了一种新的多模态检索增强生成系统,称为MMed-RAG,旨在提高医学大规模视觉语言模型的准确性。该系统通过引入领域感知的检索机制、自适应的检索上下文选择方法,以及可证明的偏好微调策略来增强模型的可靠性。实验结果表明,MMed-RAG在五个医学数据集上的表现显著提高了43.8%的事实准确性。此研究为医学诊断工具的开发提供了新的思路和方法。"
}
}
},
{
"id": "https://huggingface.co/papers/2410.13804",
"title": "BenTo: Benchmark Task Reduction with In-Context Transferability",
"url": "https://huggingface.co/papers/2410.13804",
"abstract": "Evaluating large language models (LLMs) is costly: it requires the generation and examination of LLM outputs on a large-scale benchmark of various tasks. This paper investigates how to efficiently reduce the tasks used to benchmark LLMs without affecting the evaluation quality. Our study reveals that task transferability and relevance provide critical information to identify the most representative subset of tasks via optimizing a facility location function. We propose a practically efficient metric for estimating the transferability between two tasks via in-context learning (ICL). By analyzing the pairwise transferability, we can reduce tasks in a modern LLM benchmark (e.g., MMLU or FLAN) to 5% while inducing only a <4% difference to the evaluation on the original benchmark. Compared to prior works, our method is training-free, gradient-free, and highly efficient requiring ICL only.",
"score": 19,
"issue_id": 147,
"pub_date": "2024-10-17",
"pub_date_card": {
"ru": "17 октября",
"en": "October 17",
"zh": "10月17日"
},
"hash": "e8177fd577296e7e",
"authors": [
"Hongyu Zhao",
"Ming Li",
"Lichao Sun",
"Tianyi Zhou"
],
"affiliations": [
"Lehigh University",
"University of Maryland, College Park"
],
"pdf_title_img": "assets\\pdf\\title_img\\2410.13804.jpg",
"data": {
"categories": [
"#optimization",
"#training",
"#transfer_learning",
"#benchmark"
],
"emoji": "🎯",
"ru": {
"title": "Эффективная оценка языковых моделей: меньше задач, та же точность",
"desc": "Статья исследует методы эффективного сокращения количества задач для оценки больших языковых моделей (LLM) без ущерба для качества оценки. Авторы предлагают метрику для оценки переносимости между задачами с помощью обучения в контексте (ICL). Анализируя попарную переносимость, можно сократить набор задач в современных бенчмарках LLM до 5% с разницей менее 4% по сравнению с оценкой на полном наборе. Метод не требует дополнительного обучения и градиентов, что делает его высокоэффективным."
},
"en": {
"title": "Efficient LLM Evaluation: Less is More",
"desc": "This paper explores a method to make evaluating large language models (LLMs) more efficient by reducing the number of tasks needed for benchmarking. It introduces a way to identify the most important tasks using task transferability and relevance, optimizing a facility location function. The authors propose a metric to estimate how well tasks transfer to each other using in-context learning, which helps in selecting a smaller set of tasks. Their approach can cut down the tasks in benchmarks like MMLU or FLAN to just 5% of the original, with minimal impact on evaluation quality, and it doesn't require any training or gradients."
},
"zh": {
"title": "高效评估:减少任务,保持质量",
"desc": "这篇论文研究如何在不影响评估质量的情况下,减少用于评估大型语言模型的任务数量。研究表明,任务的可转移性和相关性是识别最具代表性任务子集的关键。通过优化设施位置函数,我们提出了一种高效的指标来估计任务之间的可转移性。分析任务间的可转移性,可以将现代LLM基准中的任务减少到5%,而评估结果仅有不到4%的差异。"
}
}
},
{
"id": "https://huggingface.co/papers/2410.13785",
"title": "PopAlign: Diversifying Contrasting Patterns for a More Comprehensive Alignment",
"url": "https://huggingface.co/papers/2410.13785",
"abstract": "Alignment of large language models (LLMs) involves training models on preference-contrastive output pairs to adjust their responses according to human preferences. To obtain such contrastive pairs, traditional methods like RLHF and RLAIF rely on limited contrasting patterns, such as varying model variants or decoding temperatures. This singularity leads to two issues: (1) alignment is not comprehensive; and thereby (2) models are susceptible to jailbreaking attacks. To address these issues, we investigate how to construct more comprehensive and diversified contrasting patterns to enhance preference data (RQ1) and verify the impact of the diversification of contrasting patterns on model alignment (RQ2). For RQ1, we propose PopAlign, a framework that integrates diversified contrasting patterns across the prompt, model, and pipeline levels, introducing six contrasting strategies that do not require additional feedback labeling procedures. Regarding RQ2, we conduct thorough experiments demonstrating that PopAlign significantly outperforms existing methods, leading to more comprehensive alignment.",
"score": 18,
"issue_id": 147,
"pub_date": "2024-10-17",
"pub_date_card": {
"ru": "17 октября",
"en": "October 17",
"zh": "10月17日"
},
"hash": "d458841995668004",
"authors": [
"Zekun Moore Wang",
"Shawn Wang",
"Kang Zhu",
"Jiaheng Liu",
"Ke Xu",
"Jie Fu",
"Wangchunshu Zhou",
"Wenhao Huang"
],
"affiliations": [
"201.AI",
"AIWaves",
"Beihang University",
"HKUST",
"Tsinghua University"
],
"pdf_title_img": "assets\\pdf\\title_img\\2410.13785.jpg",
"data": {
"categories": [
"#rlhf",
"#training",
"#security",
"#architecture",
"#alignment"
],
"emoji": "🎯",
"ru": {
"title": "PopAlign: Комплексное выравнивание языковых моделей через разнообразные контрастные паттерны",
"desc": "Статья представляет новый подход к выравниванию больших языковых моделей под названием PopAlign. Этот метод использует разнообразные контрастные паттерны на уровнях промпта, модели и пайплайна для улучшения данных о предпочтениях. PopAlign решает проблемы ограниченности традиционных методов, таких как RLHF и RLAIF, и повышает устойчивость моделей к атакам типа jailbreaking. Эксперименты показывают, что PopAlign значительно превосходит существующие методы, обеспечивая более комплексное выравнивание."
},
"en": {
"title": "PopAlign: Diversifying Patterns for Better Model Alignment",
"desc": "The paper discusses improving the alignment of large language models (LLMs) by using a new framework called PopAlign. Traditional methods like RLHF and RLAIF have limitations due to their reliance on limited contrasting patterns, which can make models vulnerable to jailbreaking attacks. PopAlign introduces diversified contrasting patterns across different levels, such as prompt, model, and pipeline, without needing extra feedback labeling. Experiments show that PopAlign enhances model alignment more effectively than existing methods, making models more robust and comprehensive."
},
"zh": {
"title": "PopAlign:多样化对比策略提升模型对齐",
"desc": "这篇论文研究了如何通过对比模式来更好地调整大型语言模型的输出,使其更符合人类的偏好。传统方法如RLHF和RLAIF在对比模式上存在局限性,导致模型对攻击的脆弱性。为了解决这个问题,作者提出了PopAlign框架,通过在提示、模型和流程层面引入多样化的对比策略来增强偏好数据。实验结果表明,PopAlign显著优于现有方法,实现了更全面的模型对齐。"
}
}
},
{
"id": "https://huggingface.co/papers/2410.13639",
"title": "A Comparative Study on Reasoning Patterns of OpenAI's o1 Model",
"url": "https://huggingface.co/papers/2410.13639",
"abstract": "Enabling Large Language Models (LLMs) to handle a wider range of complex tasks (e.g., coding, math) has drawn great attention from many researchers. As LLMs continue to evolve, merely increasing the number of model parameters yields diminishing performance improvements and heavy computational costs. Recently, OpenAI's o1 model has shown that inference strategies (i.e., Test-time Compute methods) can also significantly enhance the reasoning capabilities of LLMs. However, the mechanisms behind these methods are still unexplored. In our work, to investigate the reasoning patterns of o1, we compare o1 with existing Test-time Compute methods (BoN, Step-wise BoN, Agent Workflow, and Self-Refine) by using OpenAI's GPT-4o as a backbone on general reasoning benchmarks in three domains (i.e., math, coding, commonsense reasoning). Specifically, first, our experiments show that the o1 model has achieved the best performance on most datasets. Second, as for the methods of searching diverse responses (e.g., BoN), we find the reward models' capability and the search space both limit the upper boundary of these methods. Third, as for the methods that break the problem into many sub-problems, the Agent Workflow has achieved better performance than Step-wise BoN due to the domain-specific system prompt for planning better reasoning processes. Fourth, it is worth mentioning that we have summarized six reasoning patterns of o1, and provided a detailed analysis on several reasoning benchmarks.",
"score": 16,
"issue_id": 162,
"pub_date": "2024-10-17",
"pub_date_card": {
"ru": "17 октября",
"en": "October 17",
"zh": "10月17日"
},
"hash": "39b4ff88e70ccaf2",
"authors": [
"Siwei Wu",
"Zhongyuan Peng",
"Xinrun Du",
"Tuney Zheng",
"Minghao Liu",
"Jialong Wu",
"Jiachen Ma",
"Yizhi Li",
"Jian Yang",
"Wangchunshu Zhou",
"Qunshu Lin",
"Junbo Zhao",
"Zhaoxiang Zhang",
"Wenhao Huang",
"Ge Zhang",
"Chenghua Lin",
"J. H. Liu"
],
"affiliations": [
"2077AI",
"Abaka AI",
"M-A-P",
"OpenO1 Team",
"University of Chinese Academy of Sciences",
"University of Manchester",
"Zhejiang University"
],
"pdf_title_img": "assets\\pdf\\title_img\\2410.13639.jpg",
"data": {
"categories": [
"#reasoning",
"#rl",
"#benchmark",
"#inference",
"#optimization",
"#math",
"#plp"
],
"emoji": "🧠",
"ru": {