@comment{
  Conference paper by Taylor, Fergus, LeCun and Bregler in the ECCV 2010
  proceedings (Lecture Notes in Computer Science).

  Field notes:
  - The citation key is the exporter's opaque identifier; it is kept as-is
    so existing citations keep resolving.
  - The part of the proceedings (Part VI) is carried in booktitle; volume
    holds the LNCS volume number.
    NOTE(review): 6316 is believed to be the LNCS volume for Part VI;
    confirm against the publisher page for the DOI.
  - annote keeps the exporter's original note text verbatim (copyright
    line and conference dates); standard styles do not print annote.
}
@inproceedings{11ebf57b44ad4f0083666da078246377,
  author    = {Taylor, Graham W. and Fergus, Rob and LeCun, Yann and Bregler, Christoph},
  title     = {Convolutional learning of spatio-temporal features},
  booktitle = {Computer Vision, {ECCV} 2010 - 11th European Conference on Computer Vision, Proceedings, Part {VI}},
  series    = {Lecture Notes in Computer Science},
  volume    = {6316},
  pages     = {140--153},
  publisher = {Springer Verlag},
  year      = {2010},
  doi       = {10.1007/978-3-642-15567-3_11},
  isbn      = {3642155669},
  language  = {English (US)},
  keywords  = {activity recognition, convolutional nets, optical flow, restricted Boltzmann machines, unsupervised learning, video analysis},
  abstract  = {We address the problem of learning good features for understanding video data. We introduce a model that learns latent representations of image sequences from pairs of successive images. The convolutional architecture of our model allows it to scale to realistic image sizes whilst using a compact parametrization. In experiments on the NORB dataset, we show our model extracts latent ``flow fields'' which correspond to the transformation between the pair of input frames. We also use our model to extract low-level motion features in a multi-stage architecture for action recognition, demonstrating competitive performance on both the KTH and Hollywood2 datasets.},
  annote    = {Copyright: Copyright 2019 Elsevier B.V., All rights reserved.; 11th European Conference on Computer Vision, ECCV 2010 ; Conference date: 10-09-2010 Through 11-09-2010},
}