% Conference paper by Chan, Zhang and Huang in the proceedings of the
% 1998 IEEE 2nd Workshop on Multimedia Signal Processing (MMSP 1998).
%
% Conventions used in this entry:
%   - Names are written "Last, First" with plain (unescaped) initials, so no
%     literal brace characters are typeset and sorting uses the real surname.
%   - No series field: the proceedings title lives in booktitle only, which
%     keeps styles that print the series from repeating it.
%   - NOTE(review): address holds a country, not the publisher's city;
%     confirm the city and replace it.
%   - NOTE(review): the conference date in the note reads as day-month-year
%     (7 to 9 December 1998); confirm before adding a month field.
@inproceedings{cf34fb65f6e84e82a443a57aa15220bd,
  author    = {Chan, M. T. and Zhang, Y. and Huang, T. S.},
  title     = {Real-time lip tracking and bimodal continuous speech recognition},
  editor    = {Alwan, Abeer and Ortega, Antonio and Kuo, C.-C. Jay and Nikias, C. L. Max and Wong, Ping Wah},
  booktitle = {1998 {IEEE} 2nd Workshop on Multimedia Signal Processing},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {1998},
  pages     = {65--70},
  doi       = {10.1109/MMSP.1998.738914},
  language  = {English (US)},
  note      = {This work is supported by the U.S. Army Research Laboratory under the Federated Laboratory Program, Cooperative Agreement DAAL01-96-2-0003; 2nd IEEE Workshop on Multimedia Signal Processing, MMSP 1998; Conference date: 07-12-1998 Through 09-12-1998},
  abstract  = {We investigate using a bimodal approach to speech recognition by incorporating additional visual features derived from lip movement of the speaker. A reference contour model is used to track the lip outline of the speaker. By using color, constraining the deformation in an affine subspace, and by incorporating an outlier rejection mechanism, our system is robust and runs in real time. To address the model initialization issue, a fast lip localization algorithm is also incorporated. A sample of continuous bimodal speech data based on a confined vocabulary (useful for our application area) was synchronously captured for training and testing. Using the hidden Markov modeling framework, we trained our bimodal context-dependent sub-word-based recognizer in a few different ways. The experiments show that the bimodal recognizer compares favorably to the acoustic-only counterpart. The results also indicate that it is advantageous to include first derivatives of the visual features. Furthermore, the 2-stream modeling scheme appears to be preferable to the 1-stream case for bimodal speech.},
}