@comment{
  Conference paper in the EMNLP 2024 proceedings.
  The hex citation key is kept unchanged so existing citations still resolve.
  Event name, event date and copyright line live in their own fields
  instead of note, so they are not printed verbatim in the bibliography.
  NOTE(review): braced given names such as Po Yao may have lost a hyphen
  during export -- confirm the spelling against the publisher page.
}
@inproceedings{70eb9aab423e4c898cf0a6b0283e73f0,
  author     = {Xu, Hu and Huang, {Po Yao} and Tan, {Xiaoqing Ellen} and Yeh, {Ching Feng} and Kahn, Jacob and Jou, Christine and Ghosh, Gargi and Levy, Omer and Zettlemoyer, Luke and Yih, {Wen Tau} and Li, {Shang Wen} and Xie, Saining and Feichtenhofer, Christoph},
  title      = {Altogether: Image Captioning via Re-aligning Alt-text},
  booktitle  = {EMNLP 2024 - 2024 Conference on Empirical Methods in Natural Language Processing, Proceedings of the Conference},
  editor     = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  publisher  = {Association for Computational Linguistics (ACL)},
  year       = {2024},
  month      = nov,
  pages      = {19302--19318},
  doi        = {10.18653/v1/2024.emnlp-main.1075},
  language   = {English (US)},
  eventtitle = {2024 Conference on Empirical Methods in Natural Language Processing, EMNLP 2024},
  eventdate  = {2024-11-12/2024-11-16},
  copyright  = {{\textcopyright} 2024 Association for Computational Linguistics.},
  abstract   = {This paper focuses on creating synthetic data to improve the quality of image captions. Existing works typically have two shortcomings. First, they caption images from scratch, ignoring existing alt-text metadata, and second, lack transparency if the captioners' training data (e.g. GPT) is unknown. In this paper, we study a principled approach Altogether based on the key idea to edit and re-align existing alt-texts associated with the images. To generate training data, we perform human annotation where annotators start with the existing alt-text and realign it to the image content in multiple rounds, consequently constructing captions with rich visual concepts. This differs from prior work that carries out human annotation as a one-time description task solely based on images and annotator knowledge. We train a captioner on this data that generalizes the process of realigning alt-texts at scale. Our results show our Altogether approach leads to richer image captions that also improve text-to-image generation and zero-shot image classification tasks.},
}