@inproceedings{ba694d663d284f79b1d5f3152fcfd670,
  title     = {{Urban Sound \& Sight}: Dataset and Benchmark for Audio-Visual Urban Scene Understanding},
  abstract  = {Automatic audio-visual urban traffic understanding is a growing area of research with many potential applications of value to industry, academia, and the public sector. Yet, the lack of well-curated resources for training and evaluating models to research in this area hinders their development. To address this we present a curated audio-visual dataset, Urban Sound \& Sight (Urbansas), developed for investigating the detection and localization of sounding vehicles in the wild. Urbansas consists of 12 hours of unlabeled data along with 3 hours of manually annotated data, including bounding boxes with classes and unique id of vehicles, and strong audio labels featuring vehicle types and indicating off-screen sounds. We discuss the challenges presented by the dataset and how to use its annotations for the localization of vehicles in the wild through audio models.},
  keywords  = {audio-visual, dataset, traffic, urban research},
  author    = {Fuentes, Magdalena and Steers, Bea and Zinemanas, Pablo and Rocamora, Mart{\'\i}n and Bondi, Luca and Wilkins, Julia and Shi, Qianyi and Hou, Yao and Das, Samarjit and Serra, Xavier and Bello, Juan Pablo},
  note      = {Funding Information: Work partially supported by NSF award 1955357 and Bosch RTC. Publisher Copyright: {\textcopyright} 2022 IEEE; 47th IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2022 ; Conference date: 23-05-2022 Through 27-05-2022},
  year      = {2022},
  month     = may,
  doi       = {10.1109/ICASSP43922.2022.9747644},
  language  = {English (US)},
  series    = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  pages     = {141--145},
  booktitle = {2022 IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2022 - Proceedings},
}