@inproceedings{8d034bcd7b2f470b843f1fd8ea5ad246,
  title     = {Variational Convolutional Networks for Human-Centric Annotations},
  author    = {Ke, Tsung Wei and Lin, Che Wei and Liu, Tyng Luh and Geiger, Davi},
  editor    = {Nishino, Ko and Lai, Shang-Hong and Lepetit, Vincent and Sato, Yoichi},
  booktitle = {Computer Vision - 13th Asian Conference on Computer Vision, ACCV 2016, Revised Selected Papers},
  series    = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
  publisher = {Springer Verlag},
  year      = {2017},
  pages     = {120--135},
  doi       = {10.1007/978-3-319-54190-7_8},
  isbn      = {9783319541891},
  language  = {English (US)},
  abstract  = {To model how a human would annotate an image is an important and interesting task relevant to image captioning. Its main challenge is that a same visual concept may be important in some images but becomes less salient in other situations. Further, the subjective viewpoints of a human annotator also play a crucial role in finalizing the annotations. To deal with such high variability, we introduce a new deep net model that integrates a CNN with a variational auto-encoder (VAE). With the latent features embedded in a VAE, it becomes more flexible to tackle the uncertainly of human-centric annotations. On the other hand, the supervised generalization further enables the discriminative power of the generative VAE model. The resulting model can be end-to-end fine-tuned to further improve the performance on predicting visual concepts. The provided experimental results show that our method is state-of-the-art over two benchmark datasets: MS COCO and Flickr30K, producing mAP of 36.6 and 23.49, and PHR (Precision at Human Recall) of 49.9 and 32.04, respectively.},
  note      = {Publisher Copyright: {\textcopyright} Springer International Publishing AG 2017.},
}