% Conference paper in the CVPR 2017 proceedings (Plummer, Brown, Lazebnik).
% The citation key is an opaque export hash; it is kept unchanged because
% documents cite this entry by it.
% Authors are stored in unambiguous "Last, First" form so every style splits
% and abbreviates them the same way.
% NOTE(review): month/day (6 Nov) differ from the conference dates given in
% the note field (21--26 July) -- presumably the online publication date;
% TODO confirm which date the bibliography should show.
% NOTE(review): address holds a country rather than a publisher city --
% TODO confirm the intended value.
@inproceedings{b13f0fba99e048d49ef1bfdb4f9a93df,
  author    = {Plummer, Bryan A. and Brown, Matthew and Lazebnik, Svetlana},
  title     = {Enhancing video summarization via vision-language embedding},
  booktitle = {Proceedings - 30th {IEEE} Conference on Computer Vision and Pattern Recognition, {CVPR} 2017},
  pages     = {1052--1060},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2017},
  month     = nov,
  day       = {6},
  doi       = {10.1109/CVPR.2017.118},
  language  = {English (US)},
  note      = {Conference date: 21--26 July 2017},
  abstract  = {This paper addresses video summarization, or the problem of distilling a raw video into a shorter form while still capturing the original story. We show that visual representations supervised by freeform language make a good fit for this application by extending a recent submodular summarization approach [9] with representativeness and interestingness objectives computed on features from a joint vision-language embedding space. We perform an evaluation on two diverse datasets, UT Egocentric [18] and TV Episodes [45], and show that our new objectives give improved summarization ability compared to standard visual features alone. Our experiments also show that the vision-language embedding need not be trained on domain-specific data, but can be learned from standard still image vision-language datasets and transferred to video. A further benefit of our model is the ability to guide a summary using freeform text input at test time, allowing user customization.},
}