@inproceedings{17f44852ccac495c9b4ed7b58a5b6024,
title = "Learning temporal structures for human activity recognition",
abstract = "We propose a hierarchical method for learning temporal structures for the recognition of complex human activities or actions in videos. Low level features (HOG, HOF, MBHx and MBHy) are first computed from video snippets to form concatenated feature vectors. A novel segmentation algorithm based on K-means clustering is then used to divide the video into segments, with each segment corresponding to a sub-action with uniform motion characteristics. Using low level features as inputs, a many-to-one encoder is trained to extract generalized features for the snippets in each segment. A second many-to-one encoder is then used to compute higher-level features from the generalized features. The higher-level features from individual segments are then concatenated together and used to train a third many-to-one encoder to extract a high-level feature representation for the entire video. The final descriptor is the concatenation of higher-level features from individual segments and the high-level feature for the entire video. Using the proposed descriptor and a mutli-class linear support vector machine (SVM), we achieved state-of-the-art results on datasets Olympic Sports and UCF50, and beat the state-of-the-art result on the challenging HMD51 dataset by a wide margin of 17%.",
author = "Tiantian Xu and Wong, {Edward K.}",
note = "Publisher Copyright: {\textcopyright} 2017. The copyright of this document resides with its authors.; 28th British Machine Vision Conference, BMVC 2017 ; Conference date: 04-09-2017 Through 07-09-2017",
year = "2017",
doi = "10.5244/c.31.160",
language = "English (US)",
series = "British Machine Vision Conference 2017, BMVC 2017",
publisher = "BMVA Press",
booktitle = "British Machine Vision Conference 2017, BMVC 2017",
}
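
Illustrative sketch (not the authors' code): a minimal Python rendering of the pipeline the abstract describes, with random stand-ins for the HOG/HOF/MBH features and for the trained many-to-one encoders. The function names, feature dimensions, and the use of scikit-learn's KMeans and LinearSVC are assumptions for illustration only, not the published implementation.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.svm import LinearSVC

rng = np.random.default_rng(0)

_weights = {}
def encode(x, out_dim):
    # Stand-in for a trained many-to-one encoder: a fixed random projection
    # (cached per shape so every video is encoded consistently) followed by a
    # nonlinearity; the paper trains this mapping from data instead.
    key = (x.shape[-1], out_dim)
    if key not in _weights:
        _weights[key] = rng.standard_normal(key)
    return np.tanh(x.dot(_weights[key]))

def video_descriptor(snippet_feats, n_segments=3):
    # 1. Divide the video's snippets into segments (sub-actions). The paper uses
    #    a novel K-means-based segmentation; plain K-means on the snippet
    #    features serves as a stand-in here.
    labels = KMeans(n_clusters=n_segments, n_init=10).fit_predict(snippet_feats)
    seg_feats = []
    for s in range(n_segments):
        seg = snippet_feats[labels == s]
        if len(seg) == 0:
            seg = snippet_feats
        # 2. Encoder 1 yields generalized features for the segment's snippets;
        #    encoder 2 yields the segment's higher-level features.
        h1 = encode(seg, 128).mean(axis=0)
        h2 = encode(h1[None, :], 64)[0]
        seg_feats.append(h2)
    # 3. Encoder 3 maps the concatenated segment features to a high-level
    #    feature for the entire video.
    concat = np.concatenate(seg_feats)
    video_feat = encode(concat[None, :], 64)[0]
    # 4. Final descriptor: per-segment higher-level features plus the
    #    video-level feature.
    return np.concatenate([concat, video_feat])

# Toy usage with random "videos" (30 snippets of 396-D concatenated low-level
# features each) and a multi-class linear SVM, as in the abstract.
X = np.stack([video_descriptor(rng.standard_normal((30, 396))) for _ in range(20)])
y = rng.integers(0, 3, size=20)
clf = LinearSVC(max_iter=5000).fit(X, y)
print(clf.predict(X[:5]))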