@comment{im2-publications.bib}

@inproceedings{Drygajlo7,
  author    = {Richiardi, J. and Drygajlo, A. and Kryszczuk, K.},
  title     = {Static models of derivative-coordinates phase spaces for multivariate time series classification: an application to signature verification},
  booktitle = {3rd International Conference on Biometrics (ICB 2009), Lecture Notes in Computer Science},
  year      = {2009},
  pages     = {140--149},
  keywords  = {IM2.MPR, Report_VIII}
}

@techreport{Humm20079,
  author      = {Humm, A. and Hennebert, J. and Ingold, R.},
  title       = {Database and evaluation protocols for user authentication using combined handwriting and speech modalities},
  institution = {Department of Informatics, University of Fribourg, Switzerland},
  year        = {2007},
  keywords    = {Report_VII, IM2.HMI}
}

@article{lathoud07a,
  author   = {Lathoud, G. and Odobez, J.-M.},
  title    = {Short-term spatio-temporal clustering applied to multiple moving speakers},
  journal  = {IEEE Transactions on Audio, Speech and Language Processing},
  year     = {2007},
  keywords = {Report_VI, IM2.AP.MPR, joint publication},
  abstract = {Distant microphones permit to process spontaneous multi-party speech with very little constraints on speakers, as opposed to close-talking microphones. Minimizing the constraints on speakers permits a large diversity of applications, including meeting summarization and browsing, surveillance, hearing aids, and more natural human-machine interaction. Such applications of distant microphones require to determine where and when the speakers are talking. This is inherently a multisource problem, because of background noise sources, as well as the natural tendency of multiple speakers to talk over each other. Moreover, spontaneous speech utterances are highly discontinuous, which makes difficult to track the multiple speakers with classical filtering approaches, such as Kalman Filtering or Particle Filters. As an alternative, this paper proposes a probabilistic framework to determine the trajectories of multiple moving speakers in the short-term only -- i.e. only while they speak. Instantaneous location estimates that are close in space and time are grouped into ``short-term clusters'' in a principled manner. Each short-term cluster determines the precise start and end times of an utterance, and a short-term spatial trajectory. Contrastive experiments clearly show the benefit of using short-term clustering, on real indoor recordings with seated speakers in meetings, as well as multiple moving speakers.},
  ipdmembership = {speech vision lathoud odobez}
}

@phdthesis{pozd-thesis,
  author   = {Pozdnoukhov, A.},
  title    = {Prior knowledge in kernel methods},
  school   = {\'Ecole Polytechnique F\'ed\'erale de Lausanne},
  year     = {2006},
  note     = {PhD Thesis \#3606 at the \'Ecole Polytechnique F\'ed\'erale de Lausanne (IDIAP-RR 06-66)},
  keywords = {Report_VI, IM2.MPR},
  abstract = {Machine Learning is a modern and actively developing field of computer science, devoted to extracting and estimating dependencies from empirical data. It combines such fields as statistics, optimization theory and artificial intelligence. In practical tasks, the general aim of Machine Learning is to construct algorithms able to generalize and predict in previously unseen situations based on some set of examples. Given some finite information, Machine Learning provides ways to extract knowledge, describe, explain and predict from data. Kernel Methods are one of the most successful branches of Machine Learning. They allow applying linear algorithms with well-founded properties such as generalization ability, to non-linear real-life problems. Support Vector Machine is a well-known example of a kernel method, which has found a wide range of applications in data analysis nowadays. In many practical applications, some additional prior knowledge is often available. This can be the knowledge about the data domain, invariant transformations, inner geometrical structures in data, some properties of the underlying process, etc. If used smartly, this information can provide significant improvement to any data processing algorithm. Thus, it is important to develop methods for incorporating prior knowledge into data-dependent models. The main objective of this thesis is to investigate approaches towards learning with kernel methods using prior knowledge. Invariant learning with kernel methods is considered in more details. In the first part of the thesis, kernels are developed which incorporate prior knowledge on invariant transformations. They apply when the desired transformation produce an object around every example, assuming that all points in the given object share the same class. Different types of objects, including hard geometrical objects and distributions are considered. These kernels were then applied for images classification with Support Vector Machines. 
Next, algorithms which specifically include prior knowledge are considered. An algorithm which linearly classifies distributions by their domain was developed. It is constructed such that it allows to apply kernels to solve non-linear tasks. Thus, it combines the discriminative power of support vector machines and the well-developed framework of generative models. It can be applied to a number of real-life tasks which include data represented as distributions. In the last part of the thesis, the use of unlabelled data as a source of prior knowledge is considered. The technique of modelling the unlabelled data with a graph is taken as a baseline from semi-supervised manifold learning. For classification problems, we use this approach for building graph models of invariant manifolds. For regression problems, we use unlabelled data to take into account the inner geometry of the input space. To conclude, in this thesis we developed a number of approaches for incorporating some prior knowledge into kernel methods. We proposed invariant kernels for existing algorithms, developed new algorithms and adapted a technique taken from semi-supervised learning for invariant learning. In all these cases, links with related state-of-the-art approaches were investigated. Several illustrative experiments were carried out on real data on optical character recognition, face image classification, brain-computer interfaces, and a number of benchmark and synthetic datasets.},
  ipdmembership = {learning},
  pdf        = {ftp://ftp.idiap.ch/pub/reports/2006/pozd_thesis_06-66.pdf},
  postscript = {ftp://ftp.idiap.ch/pub/reports/2006/pozd_thesis_06-66.ps.gz}
}

@article{Valente_IEEE-SIGNALPROCESSINGLETTERS_2009,
  author   = {Valente, F.},
  title    = {A Novel Criterion for Classifiers Combination in Multistream Speech Recognition},
  journal  = {IEEE Signal Processing Letters},
  year     = {2009},
  volume   = {16},
  number   = {7},
  pages    = {561--564},
  issn     = {1070-9908},
  doi      = {10.1109/lsp.2009.2019779},
  keywords = {IM2.AP, Report_VIII},
  abstract = {In this paper we propose a novel information theoretic criterion for optimizing the linear combination of classifiers in multi stream automatic speech recognition. We discuss an objective function that achieves a trade-off between the minimization of a bound on the Bayes probability of error and the minimization of the divergence between the individual classifier outputs and their combination. The method is compared with the conventional inverse entropy and minimum entropy combinations on both small and large vocabulary automatic speech recognition tasks. Results reveal that it outperforms other linear combination rules. Furthermore we discuss the advantages of the proposed approach and the extension to other (non-linear) combination rules.},
  projects = {Idiap}
}

@inproceedings{Koval:2007:Sphere-Decoding,
  author    = {Koval, O. and Voloshynovskiy, S. and Beekhof, F. and Pun, T.},
  title     = {Analysis of physical unclonable identification based on reference list decoding},
  editor    = {Delp III, E. J. and Wong, P. W. and Dittmann, J. and Memon, N. D.},
  booktitle = {Steganography, and Watermarking of Multimedia Contents X},
  series    = {Proceedings of SPIE},
  volume    = {6819},
  pages     = {68190B},
  publisher = {SPIE},
  address   = {Bellingham, WA},
  year      = {2008},
  keywords  = {Report_VII, IM2.MPR}
}

@inproceedings{JayagopiACM08,
  author    = {Jayagopi, D.},
  title     = {Predicting the dominant clique in meetings through fusion of nonverbal cues},
  booktitle = {Proc. ACM Vancouver, Canada},
  year      = {2008},
  keywords  = {IM2.MPR, Report_VIII}
}

@techreport{galan:rr07-23,
  author      = {Gal{\'a}n, F. and Ferrez, P. W. and Oliva, F. and Gu{\`a}rdia, J. and del R. Mill{\'a}n, J.},
  title       = {Feature extraction for multi-class {BCI} using canonical variates analysis},
  type        = {IDIAP-RR},
  number      = {23},
  institution = {IDIAP},
  year        = {2007},
  note        = {Submitted for publication},
  keywords    = {Report_VI, IM2.BMI},
  abstract    = {\emph{Objective}: To propose a new feature extraction method with canonical solution for multi-class Brain-Computer Interfaces (BCI). The proposed method should provide a reduced number of canonical discriminant spatial patterns (CDSP) and rank the channels sorted by power discriminability (DP) between classes. \emph{Methods}: The feature extractor relays in Canonical Variates Analysis (CVA) which provides the CDSP between the classes. The number of CDSP is equal to the number of classes minus one. We analyze EEG data recorded with 64 electrodes from 4 subjects recorded in 20 sessions. They were asked to execute twice in each session three different mental tasks (left hand imagination movement, rest, and words association) during 7 seconds. A ranking of electrodes sorted by power discriminability between classes and the CDSP were computed. After splitting data in training and test sets, we compared the classification accuracy achieved by Linear Discriminant Analysis (LDA) in frequency and temporal domains. \emph{Results}: The average LDA classification accuracies over the four subjects using CVA on both domains are equivalent (57.89\% in frequency domain and 59.43\% in temporal domain). These results, in terms of classification accuracies, are also reflected in the similarity between the ranking of relevant channels in both domains. \emph{Conclusions}: CVA is a simple feature extractor with canonical solution useful for multi-class BCI applications that can work on temporal or frequency domain.},
  ipdmembership = {learning},
  pdf         = {ftp://ftp.idiap.ch/pub/reports/2007/galan-idiap-rr-07-23.pdf},
  postscript  = {ftp://ftp.idiap.ch/pub/reports/2007/galan-idiap-rr-07-23.ps.gz}
}

@inproceedings{ganapathy:interspeech:2008,
  author    = {Ganapathy, S. and Motlicek, P. and Hermansky, H. and Garudadri, H.},
  title     = {Spectral noise shaping: improvements in speech/audio codec based on linear prediction in spectral domain},
  booktitle = {INTERSPEECH 2008},
  year      = {2008},
  location  = {Brisbane, Australia},
  note      = {IDIAP-RR 08-16},
  crossref  = {ganapathy:rr08-16},
  keywords  = {IM2.AP, Report_VII},
  abstract  = {Audio coding based on Frequency Domain Linear Prediction (FDLP) uses auto-regressive models to approximate Hilbert envelopes in frequency sub-bands. Although the basic technique achieves good coding efficiency, there is a need to improve the reconstructed signal quality for tonal signals with impulsive spectral content. For such signals, the quantization noise in the FDLP codec appears as frequency components not present in the input signal. In this paper, we propose a technique of Spectral Noise Shaping (SNS) for improving the quality of tonal signals by applying a Time Domain Linear Prediction (TDLP) filter prior to the FDLP processing. The inverse TDLP filter at the decoder shapes the quantization noise to reduce the artifacts. Application of the SNS technique to the FDLP codec improves the quality of the tonal signals without affecting the bit-rate. Performance evaluation is done with Perceptual Evaluation of Audio Quality (PEAQ) scores and with subjective listening tests.}
}

@article{Pantic_IEEESPM_2009,
  author   = {Pantic, M. and Vinciarelli, A.},
  title    = {Implicit Human Centered Tagging},
  journal  = {IEEE Signal Processing Magazine},
  year     = {2009},
  volume   = {26},
  keywords = {IM2.MCA, Report_VIII},
  projects = {SSPNet}
}

@inproceedings{liwicki08writer,
  author    = {Liwicki, M. and Schlapbach, A. and Bunke, H.},
  title     = {Writer-dependent recognition of handwritten whiteboard notes in smart meeting room environments},
  booktitle = {Proc. 8th IAPR Int. Workshop on Document Analysis Systems},
  year      = {2008},
  pages     = {151--157},
  keywords  = {IM2.VP, Report_VIII},
  peer      = {yes}
}

@inproceedings{Boakye2008,
  author    = {Boakye, K. and Vinyals, O. and Friedland, G.},
  title     = {Two's a crowd: improving speaker diarization by automatically identifying and excluding overlapped speech},
  booktitle = {Interspeech},
  year      = {2008},
  keywords  = {Report_VII, IM2.AP},
  owner     = {dines}
}

@article{Stolcke2007,
  author   = {Stolcke, A. and Kajarekar, S. and Ferrer, L. and Shriberg, E.},
  title    = {Speaker recognition with session variability normalization based on mllr adaptation transforms},
  journal  = {IEEE Transactions on Audio, Speech, and Language Processing},
  year     = {2007},
  volume   = {15},
  pages    = {1987--1998},
  keywords = {Report_VII, IM2.AP},
  owner    = {dines}
}

@inproceedings{pgarner:interspeech:2008,
  author    = {Garner, P. N.},
  title     = {Silence models in weighted finite-state transducers},
  booktitle = {Interspeech},
  year      = {2008},
  location  = {Brisbane, Australia},
  note      = {IDIAP-RR 08-19},
  crossref  = {pgarner:rr08-19},
  keywords  = {IM2.AP, Report_VII},
  abstract  = {We investigate the effects of different silence modelling strategies in Weighted Finite-State Transducers for Automatic Speech Recognition. We show that the choice of silence models, and the way they are included in the transducer, can have a significant effect on the size of the resulting transducer; we present a means to prevent particularly large silence overheads. Our conclusions include that context-free silence modelling fits well with transducer based grammars, whereas modelling silence as a monophone and a context has larger overheads.}
}

@inproceedings{ba:clear:2007,
  author    = {Ba, S. and Odobez, J.-M.},
  title     = {Probabilistic head pose tracking evaluation in single and multiple camera setups},
  booktitle = {Classification of Events, Activities and Relationship Evaluation and Workshop},
  year      = {2007},
  note      = {IDIAP-RR 07-21},
  keywords  = {Report_VI, IM2.VP},
  abstract  = {This paper presents our participation in the CLEAR 07 evaluation workshop head pose estimation tasks where two head pose estimation tasks were to be addressed. The first task estimates head poses with respect to (w.r.t.) a single camera capturing people seated in a meeting room scenario. The second task consisted of estimating the head pose of people moving in a room from four cameras w.r.t. a global room coordinate. To solve the first task, we used a probabilistic exemplar-based head pose tracking method using a mixed state particle filter based on a representation in a joint state space of head localization and pose variable. This state space representation allows the combined search for both the optimal head location and pose. To solve the second task, we first applied the same head tracking framework to estimate the head pose w.r.t. each of the four cameras. Then, using the camera calibration parameters, the head poses w.r.t. individual cameras were transformed into head poses w.r.t. the global room coordinates, and the measures obtained from the four cameras were fused using reliability measures based on skin detection. Good head pose tracking performances were obtained for both tasks.},
  ipdmembership = {vision},
  ipdxref    = {techreport:ba-idiap-rr-07-21.bib},
  pdf        = {ftp://ftp.idiap.ch/pub/papers/2007/ba-clear-2007.pdf},
  postscript = {ftp://ftp.idiap.ch/pub/papers/2007/ba-clear-2007.ps.gz}
}

@techreport{moore:rr06-03,
  author      = {Moore, D.},
  title       = {The {Juicer} {LVCSR} decoder -- user manual for {Juicer} version 0.5.0},
  type        = {IDIAP-COM},
  number      = {03},
  institution = {IDIAP},
  year        = {2006},
  keywords    = {Report_VI, IM2.AP},
  abstract    = {Juicer is a decoder for HMM-based large vocabulary speech recognition that uses a weighted finite state transducer (WFST) representation of the search space. The package consists of a number of command line utilities: the Juicer decoder itself, along with a number of tools and scripts that are used to combine the various ASR knowledge sources (language model, pronunciation dictionary, acoustic models) into a single, optimised WFST that is input to the decoder.},
  ipdmembership = {speech},
  pdf         = {ftp://ftp.idiap.ch/pub/reports/2006/moore-idiap-com-06-03.pdf},
  postscript  = {ftp://ftp.idiap.ch/pub/reports/2006/moore-idiap-com-06-03.ps.gz}
}

@incollection{Stolcke2008,
  author    = {Stolcke, A. and Anguera, X. and Boakye, K. and Cetin, O. and Janin, A. and Magimai-Doss, M. and Wooters, C. and Zheng, J.},
  title     = {The SRI-ICSI spring 2007 meeting and lecture recognition system},
  booktitle = {Multimodal Technologies for Perception of Humans},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer},
  year      = {2008},
  keywords  = {Report_VII, IM2.AP, joint publication}
}

@inproceedings{Beekhof:2007:SPIE-RPH,
  author    = {Beekhof, F. and Voloshynovskiy, S. and Koval, O. and Vill{\'a}n, R.},
  title     = {Secure surface identification codes},
  editor    = {Delp III, E. J. and Wong, P. W. and Dittmann, J. and Memon, N. D.},
  booktitle = {Steganography, and Watermarking of Multimedia Contents X},
  series    = {Proceedings of SPIE},
  volume    = {6819},
  pages     = {68190D},
  publisher = {SPIE},
  address   = {Bellingham, WA},
  year      = {2008},
  doi       = {10.1117/12.765118},
  keywords  = {Report_VII, IM2.MPR}
}

@inproceedings{Vijayasenan_INTERSPEEH2009_2009,
  author    = {Vijayasenan, D. and Valente, F. and Bourlard, H.},
  title     = {KL Realignment for Speaker Diarization with Multiple Feature Streams},
  booktitle = {10th Annual Conference of the International Speech Communication Association},
  year      = {2009},
  keywords  = {IM2.AP, Report_VIII},
  abstract  = {This paper aims at investigating the use of Kullback-Leibler (KL) divergence based realignment with application to speaker diarization. The use of KL divergence based realignment operates directly on the speaker posterior distribution estimates and is compared with traditional realignment performed using HMM/GMM system. We hypothesize that using posterior estimates to re-align speaker boundaries is more robust than gaussian mixture models in case of multiple feature streams with different statistical properties. Experiments are run on the NIST RT06 data. These experiments reveal that in case of conventional MFCC features the two approaches yields the same performance while the KL based system outperforms the HMM/GMM re-alignment in case of combination of multiple feature streams (MFCC and TDOA).},
  projects  = {Idiap, AMIDA, IM2}
}

@inproceedings{LTS-CONF-2007-062,
  author      = {Ouaret, M. and Dufaux, F. and Ebrahimi, T.},
  title       = {Enabling Privacy For Distributed Video Coding by Transform Domain Scrambling},
  booktitle   = {2008 SPIE Visual Communications and Image Processing},
  year        = {2008},
  location    = {San Diego, USA},
  url         = {http://infoscience.epfl.ch/getfile.py?recid=112162&mode=best},
  keywords    = {Report_VII, IM2.VP,Media Security; Privacy; Scrambling; Transform Domain; Distributed Video Coding},
  abstract    = {In this paper, a novel scheme for video scrambling is introduced for Distributed Video Coding. The goal is to conceal video information in several applications such as video surveillance and anonymous video communications to preserve privacy. This is achieved by performing a transform domain scrambling on both Key and Wyner-Ziv frames. More specifically, the sign of the scrambled transform coefficient is inverted at the encoder side. The scrambling pattern is defined by a secret key and the latter is required at the decoder for descrambling. The scheme is proven to provide a good level of security in addition to a flexible scrambling level (i.e the amount of distortion introduced). Finally, it is shown that the original DVC scheme and the one with scrambling have a similar rate distortion performance. In other words, the DVC compression efficiency is not negatively impacted by the introduction of the scrambling.},
  affiliation = {EPFL},
  details     = {http://infoscience.epfl.ch/record/112162},
  oai-id      = {oai:infoscience.epfl.ch:112162},
  oai-set     = {conf; fulltext},
  review      = {REVIEWED},
  status      = {PUBLISHED},
  unit        = {LTS MMSPL}
}

@article{Orabona_JMLR_2009,
  author   = {Orabona, F. and Keshet, J. and Caputo, B.},
  title    = {Bounded kernel-based perceptrons},
  journal  = {Journal of Machine Learning Research},
  year     = {2009},
  note     = {Accepted for publication},
  keywords = {IM2.AP, Report_VIII},
  projects = {Idiap, DIRAC}
}

@article{liwicki07size,
  author   = {Liwicki, M. and Bunke, H.},
  title    = {Handwriting recognition of whiteboard notes -- studying the influence of training set size and type},
  journal  = {Int. Journal of Pattern Recognition and Art. Intelligence},
  year     = {2007},
  volume   = {21},
  number   = {1},
  pages    = {83--98},
  keywords = {Report_VI, IM2.VP},
  peer     = {yes}
}

@inproceedings{liu07,
  author    = {Liu, Y. and Shriberg, E.},
  title     = {Comparing Evaluation Metrics for Sentence Boundary Detection},
  booktitle = {Proc. ICASSP, Honolulu},
  year      = {2007},
  keywords  = {Report_VI, IM2.AP}
}

@inproceedings{Drygajlo3,
  author    = {Kryszczuk, K. and Drygajlo, A.},
  title     = {Impact of feature correlations on separation between bivariate normal distributions},
  booktitle = {19th International Conference on Pattern Recognition},
  year      = {2008},
  keywords  = {IM2.MPR, Report_VIII}
}

@inproceedings{Drygajlo2,
  author    = {Schouten, B. and Juul, N. and Drygajlo, A. and Tistarelli, M.},
  title     = {Biometrics and identity management},
  series    = {Lecture Notes in Computer Science},
  volume    = {5372},
  publisher = {Springer},
  year      = {2008},
  keywords  = {IM2.MPR, Report_VIII},
  internal-note = {Looks like an edited LNCS volume rather than a conference paper; consider @book/@proceedings with editor field -- verify against the original publication}
}

@techreport{Garg_Idiap-RR-22-2009,
  author      = {Garg, N.},
  title       = {Co-occurrence Models for Image Annotation and Retrieval},
  type        = {Idiap-RR},
  number      = {Idiap-RR-22-2009},
  institution = {Idiap},
  year        = {2009},
  note        = {Ecole Polytechnique F\'ed\'erale de Lausanne - Master Thesis},
  keywords    = {IM2.MCA, Report_VIII},
  abstract    = {We present two models for content-based automatic image annotation and retrieval in web image repositories, based on the co-occurrence of tags and visual features in the images. In particular, we show how additional measures can be taken to address the noisy and limited tagging problems, in datasets such as Flickr, to improve performance. As in many state-of-the-art works, an image is represented as a bag of visual terms computed using edge and color information. The cooccurrence information of visual terms and tags is used to create models for image annotation and retrieval. The first model begins with a naive Bayes approach and then improves upon it by using image pairs as single documents to significantly reduce the noise and increase annotation performance. The second method models the visual terms and tags as a graph, and uses query expansion techniques to improve the retrieval performance. We evaluate our methods on the commonly used 150 concept Corel dataset, and a much harder 2000 concept Flickr dataset.},
  projects    = {Idiap}
}

@inproceedings{anguera07a,
  author    = {Anguera, X. and Shinozaki, T. and Wooters, C. and Hernando, J.},
  title     = {Model Complexity Selection and Cross-validation EM Training for Robust Speaker Diarization},
  booktitle = {Proc. ICASSP, Honolulu},
  year      = {2007},
  keywords  = {Report_VI, IM2.AP}
}

@inproceedings{anguera07b,
  author    = {Anguera, X. and Wooters, C. and Pardo, J. M. and Hernando, J.},
  title     = {Automatic Weighting for the Combination of TDOA and Acoustic Features in Speaker Diarization for Meetings},
  booktitle = {Proc. ICASSP, Honolulu},
  year      = {2007},
  keywords  = {Report_VI, IM2.AP}
}

@inproceedings{haketa:2008-44,
  author    = {Ketabdar, H. and Bourlard, H.},
  title     = {In-context phone posteriors as complementary features for tandem {ASR}},
  booktitle = {ICSLP'08},
  year      = {2008},
  location  = {Brisbane, Australia},
  keywords  = {IM2.AP, Report_VII},
  abstract  = {In this paper, we present a method for integrating possible prior knowledge (such as phonetic and lexical knowledge), as well as acoustic context (e.g., the whole utterance) in the phone posterior estimation, and we propose to use the obtained posteriors as complementary posterior features in Tandem ASR configuration. These posteriors are estimated based on HMM state posterior probability definition (typically used in standard HMMs training). In this way, by integrating the appropriate prior knowledge and context, we enhance the estimation of phone posteriors. These new posteriors are called ``in-context'' or HMM posteriors. We combine these posteriors as complementary evidences with the posteriors estimated from a Multi Layer Perceptron (MLP), and use the combined evidence as features for training and inference in Tandem configuration. This approach has improved the performance, as compared to using only MLP estimated posteriors as features in Tandem, on OGI Numbers, Conversational Telephone speech (CTS), and Wall Street Journal (WSJ) databases.}
}

@inproceedings{Drygajlo8,
  author    = {Zhu, K. and Drygajlo, A. and Li, W.},
  title     = {Q-stack aging model for face verification},
  booktitle = {European Signal Processing Conference (Eusipco 2009)},
  year      = {2009},
  keywords  = {IM2.MPR, Report_VIII}
}

@article{Jayagopi_tasl08,
  author   = {Jayagopi, D. and Hung, H. and Yeo, C. and Gatica-Perez, D.},
  title    = {Modeling dominance in group conversations from nonverbal activity cues},
  journal  = {IEEE Trans. on Audio, Speech and Language Processing, Special Issue on Multimodal Processing for Speech-based Interactions},
  year     = {2008},
  note     = {Accepted for publication},
  keywords = {Report_VII, IM2.MPR}
}

@inproceedings{Jayagopi_IEEEINTERNATIONALCONFERENCEONMULTIMEDIA&EXPO(ICME2009)_2009,
  author    = {Jayagopi, D. and Bogdan, R. and Gatica-Perez, D.},
  title     = {Characterising Conversational Group Dynamics Using Nonverbal Behaviour},
  booktitle = {Proceedings ICME 2009},
  year      = {2009},
  keywords  = {IM2.MCA, Report_VIII},
  internal-note = {Author "Bogdan, R." looks like a given/family name swap (Raducanu, Bogdan?) -- verify against the original publication},
  abstract  = {This paper addresses the novel problem of characterizing conversational group dynamics. It is well documented in social psychology that depending on the objectives of a group, the dynamics are different. For example, a competitive meeting has a different objective from that of a collaborative meeting. We propose a method to characterize group dynamics based on the joint description of a group members' aggregated acoustical nonverbal behaviour to classify two meeting datasets (one being cooperative-type and the other being competitive-type). We use 4.5 hours of real behavioural multi-party data and show that our methodology can achieve a classification rate of up to 100\%.},
  projects  = {Idiap, AMIDA, IM2}
}

@inproceedings{EnricoBertini20096a,
  author    = {Bertini, E. and Lalanne, D.},
  title     = {Surveying the complementary roles of automatic data analysis and visualization in knowledge discovery},
  booktitle = {Proceedings of ACM SIGKDD Workshop on Visual Analytics and Knowledge Discovery, VAKD '09, 15th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (VAKD 2009)},
  year      = {2009},
  pages     = {12--20},
  keywords  = {IM2.HMI, Report_VIII}
}

@techreport{Garg_Idiap-RR-21-2009,
  author      = {Garg, N. and Gatica-Perez, D.},
  title       = {Tagging and Retrieving Images with Co-Occurrence Models: from Corel to Flickr},
  type        = {Idiap-RR},
  number      = {Idiap-RR-21-2009},
  institution = {Idiap},
  year        = {2009},
  keywords    = {IM2.MCA, Report_VIII},
  abstract    = {This paper presents two models for content-based automatic image annotation and retrieval in web image repositories, based on the co-occurrence of tags and visual features in the images. In particular, we show how additional measures can be taken to address the noisy and limited tagging problems, in datasets such as Flickr, to improve performance. An image is represented as a bag of visual terms computed using edge and color information. The first model begins with a naive Bayes approach and then improves upon it by using image pairs as single documents to significantly reduce the noise and increase annotation performance. The second method models the visual features and tags as a graph, and uses query expansion techniques to improve the retrieval performance. We evaluate our methods on the commonly used 150 concept Corel dataset, and a much harder 2000 concept Flickr dataset.},
  projects    = {Idiap}
}

@inproceedings{Anemueller_ICCS2008_2008,
  author    = {Anemuller, J. and Back, J.-H. and Caputo, B. and Luo, J. and Ohl, F. and Orabona, F. and Vogels, R. and Weinshall, D. and Zweig, A.},
  title     = {Biologically Motivated Audio-Visual Cue Integration for Object},
  booktitle = {Proceedings of the First International Conference on Cognitive Systems},
  year      = {2008},
  keywords  = {IM2.MPR, Report_VIII},
  internal-note = {Title appears truncated ("... for Object"); verify full title against the original publication},
  abstract  = {Auditory and visual cues are important sensor inputs for biological and artificial systems. They provide crucial information for navigating environments, recognizing categories, animals and people. How to combine effectively these two sensory channels is still an open issue. As a step towards this goal, this paper presents a comparison between three different multi-modal integration strategies, for audiovisual object category detection. We consider a high-level and a low-level cue integration approach, both biologically motivated, and we compare them with a mid-level cue integration scheme. All the three integration methods are based on the least square support vector machine algorithm, and state of the art audio and visual feature representations. We conducted experiments on two audio-visual object categories, dogs and guitars, presenting different visual and auditory characteristics. Results show that the high-level integration scheme consistently performs better than single cue methods, and of the other two integration schemes. These findings confirm results from the neuroscience. This suggests that the high-level integration scheme is the most suitable approach for multi-modal cue integration for artificial cognitive systems.},
  projects  = {Idiap, DIRAC}
}

@article{Romsdorfer:07,
  author   = {Romsdorfer, H. and Pfister, B.},
  title    = {Text analysis and language identification for polyglot text-to-speech synthesis},
  journal  = {Speech Communication (Elsevier)},
  year     = {2007},
  note     = {(to appear)},
  keywords = {Report_VI, IM2.AP}
}

@article{kosinov2008:spic,
  author   = {Kosinov, S. and Bruno, E. and Marchand-Maillet, S.},
  title    = {Spatially-consistent partial matching for intra- and inter-image prototype selection},
  journal  = {Signal Processing: Image Communication},
  year     = {2008},
  note     = {To appear; special issue on ``Semantic Analysis for Interactive Multimedia Services''},
  keywords = {Report_VII, IM2.MCA}
}

@phdthesis{AndreasHumm200811,
  author   = {Humm, A.},
  title    = {Modelling combined handwriting and speech modalities for user authentication},
  school   = {University of Fribourg, Switzerland},
  year     = {2008},
  keywords = {IM2.MPR, Report_VIII}
}

@inproceedings{Bertini2007,
  author    = {Bertini, E. and Hertzog, P. and Lalanne, D.},
  title     = {Spiralview: a visual tool to improve monitoring and understanding of security data in corporate},
  booktitle = {IEEE Symposium on Visual Analytics Science and Technology 2007 (VAST'07)},
  year      = {2007},
  note      = {To appear},
  keywords  = {Report_VI, IM2.HMI}
}

@article{Salamin_IEEETRANSACTIONSONMULTIMEDIA_2009,
  author   = {Salamin, H. and Favre, S. and Vinciarelli, A.},
  title    = {Automatic Role Recognition in Multiparty Recordings: Using Social Affiliation Networks for Feature Extraction},
  journal  = {IEEE Transactions on Multimedia},
  year     = {2009},
  note     = {To appear},
  keywords = {IM2.MCA, Report_VIII},
  abstract = {Automatic analysis of social interactions attracts increasing attention in the multimedia community. This paper considers one of the most important aspects of the problem, namely the roles played by individuals interacting in different settings. In particular, this work proposes an automatic approach for the recognition of roles in both production environment contexts (e.g., news and talk-shows) and spontaneous situations (e.g., meetings). The experiments are performed over roughly 90 hours of material (one of the largest databases used for role recognition in the literature) and show that the recognition effectiveness depends on how much the roles influence the behavior of people. Furthermore, this work proposes the first approach for modeling mutual dependences between roles and assesses its effect on role recognition performance.},
  projects = {Idiap, IM2, SSPNet}
}

@inproceedings{Hung_fg8,
title = {Identifying dominant people in meetings from audio-visual sensors},
author = {Hung, H.  and Gatica-Perez, D. },
booktitle = {Proc. IEEE Int. Conf. on Automatic Face and Gesture Recognition, Special Session on Multimodal HCI for Smart Environments},
year = {2008},
keywords = {Report_VII, IM2.MPR},
abstract = {This paper provides an overview of the area of automated dominance estimation in group meetings. We describe research in social psychology and use this to explain the motivations behind suggested automated systems. With the growth in availability of conversational data captured in meeting rooms, it is possible to investigate how multi-sensor data allows us to characterize non-verbal behaviors that contribute towards dominance. We use an overview of our own work to address the challenges and opportunities in this area of research.}
}

@inproceedings{Li2008b,
title = {Neural network based regression for robust overlapping speech recognition using microphone arrays},
author = {Li, W.  and Dines, J.  and Magimai-Doss, M.  and Bourlard, H. },
booktitle = {Interspeech},
year = {2008},
keywords = {Report_VII, IM2.AP},
owner = {dines}
}

@inproceedings{Li2008c,
title = {Effective post-processing of single-channel frequency-domain speech enhancement},
author = {Li, W. },
booktitle = {IEEE conference on multimedia and expo},
year = {2008},
keywords = {Report_VII, IM2.AP},
owner = {dines}
}

@inproceedings{Li2008a,
title = {Mlp-based log spectral energy mapping for robust overlapping speech recognition},
author = {Li, W.  and Magimai-Doss, M.  and Dines, J.  and Bourlard, H. },
booktitle = {European Signal Processing Conference},
year = {2008},
keywords = {Report_VII, IM2.AP},
owner = {dines}
}

@article{Caputo_IVC_2009,
title = {Classifying Material in the Real World},
author = {Caputo, B.  and Hayman, E.  and Fritz, M.  and Eklundh, J. -O. },
journal = {Image and Vision Computing},
year = {2009},
note = {Accepted for publication},
keywords = {IM2.VP, IM2.MCR, Report_VIII},
projects = {Idiap,
DIRAC}
}

@inproceedings{Yao_ECCV-2_2008,
title = {Multi-camera multi-person 3d space tracking with mcmc in surveillance scenarios},
author = {Yao, J.  and Odobez, J. -M. },
booktitle = {European Conference on Computer Vision, workshop on Multi Camera and Multi-modal Sensor Fusion Algorithms and Applications (ECCV-M2SFA2)},
year = {2008},
location = {Marseille},
keywords = {IM2.VP, Report_VIII},
abstract = {We present an algorithm for the tracking of a variable number of 3D persons in a multi-camera setting with partial field-of-view overlap. The multi-object tracking problem is posed in a Bayesian framework and relies on a joint multi-object state space with individual object states defined in the 3D world. The Reversible Jump Markov Chain Monte Carlo (RJ-MCMC) method is used to efficiently search the state-space and recursively estimate the multi-object configuration. The paper presents several contributions: i) the use and extension of several key features for efficient and reliable tracking (e.g. the use of the MCMC framework for multiple camera MOT; the use of powerful human detector outputs in the MCMC proposals to automatically initialize/update object tracks); ii) the definition of appropriate prior on the object state, to take into account the effects of 2D image measurement uncertainties on the 3D object state estimation due to depth effects; iii) a simple rectification method aligning people 3D standing direction with 2D image vertical axis, allowing to obtain better object measurements relying on rectangular boxes and integral images; iv) representing objects with multiple reference color histograms, to account for variability in color measurement due to changes in pose, lighting, and importantly multiple camera view points. Experimental results on challenging real-world tracking sequences and situations demonstrate the efficiency of our approach.},
projects = {Idiap,
CARETAKER},
}

@article{TSP2006,
title = {The gaussian transform of distributions: definition, computation and application},
author = {Alecu, T. I.  and Voloshynovskiy, S.  and Pun, T. },
journal = {IEEE Trans. on Signal Processing},
year = {2006},
volume = {54},
number = {8},
pages = {2976--2995},
keywords = {Report_VI, IM2.MPR}
}

@inproceedings{Koval:2008:MMSec,
title = {Privacy-preserving multimodal person and object identification},
author = {Koval, O.  and Voloshynovskiy, S.  and Pun, T. },
booktitle = {Proceedings of the 10th ACM Workshop on Multimedia \& Security},
year = {2008},
keywords = {Report_VII, IM2.MPR}
}

@inproceedings{Pun3,
title = {Computational aspects of the eeg forward problem solution for real head model using finite element},
author = {Rytsar, R.  and Pun, T. },
booktitle = {29th Annual Int. Conf. IEEE Engineering in Medicine and Biology Society},
year = {2007},
keywords = {Report_VI, IM2.MPR}
}

@inproceedings{Koval:IWMSP:2007a,
title = {Analysis of multimodal binary detection systems based on dependent/independent modalities},
author = {Koval, O.  and Voloshynovskiy, S.  and Pun, T. },
booktitle = {Proceedings of the IEEE 2007 International Workshop on Multimedia Signal Processing},
year = {2007},
keywords = {Report_VII, IM2.MPR}
}

@article{eth_biwi_00517,
title = {Speeded-up robust features ({SURF})},
author = {Bay, H.  and Ess, A.  and Tuytelaars, T.  and van Gool, L. },
journal = {Computer Vision and Image Understanding (CVIU)},
year = {2007},
keywords = {Report_VII, IM2.VP.MCA, joint}
}

@article{ferrez:2008:ieee-tbme,
title = {Error-related eeg potentials generated during simulated brain-computer interaction},
author = {Ferrez, P. W.  and Millán, J. del R. },
journal = {IEEE Transactions on Biomedical Engineering},
year = {2008},
volume = {55},
number = {3},
pages = {923--929},
doi = {10.1109/tbme.2007.908083},
keywords = {IM2.BMI, Report_VII},
abstract = {Brain-computer interfaces (BCIs) are prone to errors in the recognition of subject's intent. An elegant approach to improve the accuracy of BCIs consists in a verification procedure directly based on the presence of error-related potentials (ErrP) in the EEG recorded right after the occurrence of an error. Several studies show the presence of ErrP in typical choice reaction tasks. However, in the context of a BCI, the central question is: "Are ErrP also elicited when the error is made by the interface during the recognition of the subject's intent?" We have thus explored whether ErrP also follow a feedback indicating incorrect responses of the simulated BCI interface. Five healthy volunteer subjects participated in a new human-robot interaction experiment, which seem to confirm the previously reported presence of a new kind of ErrP. But in order to exploit these ErrP we need to detect them in each single trial using a short window following the feedback associated to the response of the BCI. We have achieved an average recognition rate of correct and erroneous single trials of 83.5\% and 79.2\%, respectively using a classifier built with data recorded up to three months earlier.}
}

@article{buttfield:2006:ieee-tnsre,
title = {Towards a robust bci: error potentials and online learning},
author = {Buttfield, A.  and Ferrez, P. W.  and del R. Millán, J. },
journal = {IEEE Trans. on Neural Systems and Rehabilitation Engineering},
year = {2006},
volume = {14},
number = {2},
pages = {164--168},
keywords = {Report_VI, IM2.BMI},
abstract = {Recent advances in the field of Brain-Computer Interfaces (BCIs) have shown that BCIs have the potential to provide a powerful new channel of communication, completely independent of muscular and nervous systems. However, while there have been successful laboratory demonstrations, there are still issues that need to be addressed before BCIs can be used by non-experts outside the laboratory. At IDIAP we have been investigating several areas that we believe will allow us to improve the robustness, flexibility and reliability of BCIs. One area is recognition of cognitive error states, that is, identifying errors through the brain's reaction to mistakes. The production of these error potentials (ErrP) in reaction to an error made by the user is well established. We have extended this work by identifying a similar but distinct ErrP that is generated in response to an error made by the interface, (a misinterpretation of a command that the user has given). This ErrP can be satisfactorily identified in single trials and can be demonstrated to improve the theoretical performance of a BCI. A second area of research is online adaptation of the classifier. BCI signals change over time, both between sessions and within a single session, due to a number of factors. This means that a classifier trained on data from a previous session will probably not be optimal for a new session. In this paper we present preliminary results from our investigations into supervised online learning that can be applied in the initial training phase. We also discuss the future direction of this research, including the combination of these two currently separate issues to create a potentially very powerful BCI.},
ipdmembership = {learning},
pdf = {ftp://ftp.idiap.ch/pub/reports/2006/buttfield_2006_tnsre.pdf}
}

@inproceedings{eth_biwi_00535,
title = {Real-time face pose estimation from single range images},
author = {Breitenstein, M. D.  and Kuettel, D.  and Weise, T.  and van Gool, L.  and Pfister, H. },
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR'08)},
year = {2008},
publisher = {IEEE Press},
keywords = {Report_VII, IM2.VP}
}

@techreport{perrin:tech:2009,
title = {Dialog Management Technique for Brain-Computer Interfaces},
author = {Perrin, X.  and Chavarriaga, R.  and Pradalier, C.  and Millán, J. del R.  and Siegwart, R. },
year = {2009},
institution = {Autonomous Systems Lab, ETHZ},
keywords = {IM2.BMI, Report_VIII}
}

@inproceedings{lovitt:ICSLP:2007,
title = {On confusions in a phoneme recognizer},
author = {Lovitt, A.  and Pinto, J. P.  and Hermansky, H. },
year = {2007},
note = {IDIAP-RR 07-10},
keywords = {Report_VI, IM2.AP},
abstract = {In this paper, we analyze the confusions patterns at three places in the hybrid phoneme recognition system. The confusions are analyzed at the pronunciation, the posterior probability, and the phoneme recognizer levels. The confusions show significant structure that is similar at all levels. Some confusions also correlate with human psychoacoustic experiments in white masking noise. These structures imply that not all errors should be counted equally and that some phoneme distinctions are arbitrary. Understanding these confusion patterns can improve the performance of a recognizer by eliminating problematic phoneme distinctions. These principles are applied to a phoneme recognition system and the results show a marked improvement in the phone error rate. Confusion pattern analysis leads to a better way of choosing phoneme sets for recognition.},
ipdmembership = {speech},
ipdxref = {techreport:lovitt-idiap-rr-07-10.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2007/lovitt-ICSLP-2007.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2007/lovitt-ICSLP-2007.ps.gz}
}

@article{kosinov2008:paa,
title = {Distance-based discriminant analysis method and its applications},
author = {Kosinov, S.  and Pun, T. },
journal = {Pattern Analysis and Applications},
year = {2008},
volume = {11},
number = {3-4},
pages = {227--246},
note = {(DOI: 10.1007/s10044-007-0082-x)},
url = {http://viper.unige.ch/documents/pdf/kosinov2008-paa.pdf},
keywords = {IM2.MCA, Report_VIII}
}

@inproceedings{eth_biwi_00533,
title = {Combining densely sampled form and motion for human action recognition},
author = {Schindler, K.  and van Gool, L. },
booktitle = {DAGM Annual Pattern Recognition Symposium},
year = {2008},
publisher = {Springer},
keywords = {Report_VII, IM2.VP}
}

@inproceedings{millan:2007:isrr,
title = {An asynchronous and non-invasive brain-actuated wheelchair},
author = {Galán, F.  and Nuttin, M.  and Lew, E.  and Ferrez, P. W.  and Vanacker, G.  and Philips, J.  and van Brussel, H.  and Millán, J. del R. },
booktitle = {Proceedings of the 13th International Symposium on Robotics Research},
year = {2007},
volume = {128},
keywords = {IM2.BCI, Report_VII},
abstract = {Objectives: To develop a robust asynchronous and non-invasive brain-computer interface (BCI) for brain-actuated wheelchair driving, and to assess the system robustness over time and context. Methods: Two subjects were asked to mentally drive a simulated wheelchair from a starting point to a goal following a pre-specified path in a simulated environment. Each subject participated in 5 experimental sessions integrated by 10 trials each. The experimental sessions were carried on with different elapsed times between them (since one hour to two months) to assess the system robustness over time.The path was divided in seven stretches to assess the robustness over context. Results: The two subjects were able to reach 90\% (subject 1) and 80\% (subject 2) of the final goals one day after the calibration of the BCI system, and 100\% (subject 1) and 70\% (subject 2) two months later. Different performances were obtained over the different path stretches.}
}

@phdthesis{christos-dimitrakakis:phd-thesis:2006,
title = {Ensembles for sequence learning},
author = {Dimitrakakis, C. },
year = {2006},
school = {\'Ecole Polytechnique F\'ed\'erale de Lausanne},
keywords = {Report_VI, IM2.MPR, Ensembles, boosting, bagging, mixture of experts, speech recognition, reinforcement learning, exploration-exploitation, uncertainty, sequence learning, sequential decision making},
abstract = {This thesis explores the application of ensemble methods to sequential learning tasks. The focus is on the development and the critical examination of new methods or novel applications of existing methods, with emphasis on supervised and reinforcement learning problems. In both types of problems, even after having observed a certain amount of data, we are often faced with uncertainty as to which hypothesis is correct among all the possible ones. However, in many methods for both supervised and for reinforcement learning problems this uncertainty is ignored, in the sense that there is a single solution selected out of the whole of the hypothesis space. Apart from the classical solution of analytical Bayesian formulations, ensemble methods offer an alternative approach to representing this uncertainty. This is done simply through maintaining a set of alternative hypotheses. The sequential supervised problem considered is that of automatic speech recognition using hidden Markov models. The application of ensemble methods to the problem represents a challenge in itself, since most such methods can not be readily adapted to sequential learning tasks. This thesis proposes a number of different approaches for applying ensemble methods to speech recognition and develops methods for effective training of phonetic mixtures with or without access to phonetic alignment data. Furthermore, the notion of expected loss is introduced for integrating probabilistic models with the boosting approach. In some cases substantial improvements over the baseline system are obtained. In reinforcement learning problems the goal is to act in such a way as to maximise future reward in a given environment. In such problems uncertainty becomes important since neither the environment nor the distribution of rewards that result from each action are known. This thesis presents novel algorithms for acting nearly optimally under uncertainty based on theoretical considerations. 
Some ensemble-based representations of uncertainty (including a fully Bayesian model) are developed and tested on a few simple tasks resulting in performance comparable with the state of the art. The thesis also draws some parallels between a proposed representation of uncertainty based on gradient-estimates and on prioritised sweeping'' and between the application of reinforcement learning to controlling an ensemble of classifiers and classical supervised ensemble learning methods.},
ipdmembership = {Learning},
pdf = {ftp://ftp.idiap.ch/pub/reports/2006/christos-dimitrakakis-phd-thesis.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2006/christos-dimitrakakis-phd-thesis.ps.gz}
}

@inproceedings{2009-bologna-icta,
title = {The multi-touch see color interface},
author = {Bologna, G.  and Malandain, S.  and Deville, B.  and Pun, T. },
booktitle = {ICTA 2009, The 2nd International Conference on Information and Communication Technologies and Accessibility, Hammamet, Tunisia, May 7--9},
year = {2009},
keywords = {IM2.MCA, Report_VIII}
}

@article{Gurban2009,
title = {Information theoretic feature extraction for audio-visual speech recognition},
author = {Gurban, M.  and Thiran, J. -Ph. },
journal = {IEEE Trans. on Signal Processing},
year = {2009},
note = {In press},
keywords = {IM2.MPR, Report_VIII}
}

@book{AMR2006,
title = {Adaptive multimedia retrieval: user, context and feedback},
author = {Marchand-Maillet, S.  and Bruno, E.  and Nürnberger, A.  and Detyniecki, M. },
editor = {van Rijsbergen, C. J. },
year = {2007},
publisher = {Springer},
keywords = {Report_VI, IM2.MCA}
}

@inproceedings{eth_biwi_00518,
title = {Event-based tracking evaluation metric},
author = {Roth, D.  and Koller-Meier, E.  and Rowe, D.  and Moeslund, T. B.  and van Gool, L. },
booktitle = {IEEE Workshop on Motion and Video Computing (WMVC)},
year = {2008},
keywords = {Report_VII, IM2.VP}
}

@inbook{BrunoDumas20093,
title = {Multimodal interfaces: a survey of principles, models and frameworks},
author = {Dumas, B.  and Lalanne, D.  and Oviatt, S. },
year = {\bibnodate},
keywords = {IM2.HMI, Report_VIII}
}

@techreport{Picart_Idiap-RR-18-2009,
title = {Improved Phone Posterior Estimation Through k-NN and MLP-Based Similarity},
author = {Picart, B. },
year = {2009},
type = {Idiap-RR},
number = {Idiap-RR-18-2009},
institution = {Idiap},
address = {Rue Marconi 19, 1920 Martigny - Switzerland},
keywords = {IM2.AP, Report_VIII},
abstract = {In this work, we investigate the possible use of k-nearest neighbour (kNN) classifiers to perform frame-based acoustic phonetic classification, hence replacing Gaussian Mixture Models (GMM) or MultiLayer Perceptrons (MLP) used in standard Hidden Markov Models (HMMs). The driving motivation behind this idea is the fact that kNN is known to be an "optimal" classifier if a very large amount of training data is available (replacing the training of functional parameters by plain memorization of the training examples) and the correct distance metric is found. Nowadays, amount of training data is no longer an issue. In the current work, we thus specifically focused on the "correct" distance metric, mainly using an MLP to compute the probability that two input feature vectors are part of the same phonetic class or not. This MLP output can thus be used as a distance metric for kNN. While providing a "universal" distance metric, this work also enabled us to consider the speech recognition problem under a different angle, simply formulated in terms of hypothesis tests: "Given two feature vectors, what is the probability that these belong to the same (phonetic) class or not?". Actually, one of the main goals of the present thesis finally boils down to one interesting question: ''Is it easier to classify feature vectors into C phonetic classes or to tell whether or not two feature vectors belong to the same class?''. This work was done with standard acoustic features as inputs (PLP) and with posterior features (resulting of another pre-training MLP). Both feature sets indeed exhibit different properties and metric spaces. For example, while the use of posteriors as input is motivated by the fact that they are speaker and environment independent (so they capture much of the phonetic information contained in the signal), they are also no longer Gaussian distributed. 
When showing mathematically that using the MLP as a similarity measure makes sense, we discovered that this measure was equivalent to a very simple metric that can be analytically computed without needing the use of an MLP. This new type of measure is in fact the scalar product between two posterior feature vectors. Experiments have been conducted on hypothesis tests and on kNN classification. Results of the hypothesis tests show that posterior feature vectors achieve better performance than acoustic feature vectors. Moreover, the use of the scalar product leads to better performance than the use of all other metrics (including the MLP-based distance metric), whatever the input features.},
projects = {Idiap},
}

@inproceedings{Hennebert07:iss,
title = {Please repeat: my voice is my password. from the basics to real-life implementations of speaker verification technologies},
author = {Hennebert, J. },
booktitle = {Invited lecture at the Information Security Summit (IS2 2007), Prague},
year = {2007},
keywords = {Report_VI, IM2.MPR}
}

@inproceedings{Motlicek_TSD2008_2008,
title = {Perceptually motivated Sub-band Decomposition for FDLP Audio Coding},
author = {Motlicek, P.  and Ganapathy, S.  and Hermansky, H.  and Garudadri, H.  and Athineos, M. },
booktitle = {Text, Speech and Dialogue},
series = {Series of Lecture Notes in Artificial Intelligence (LNAI)},
year = {2008},
volume = {5246},
pages = {435--442},
publisher = {Springer-Verlag Berlin, Heidelberg},
location = {Brno, Czech Republic},
keywords = {Audio Coding, Frequency Domain Linear Prediction (FDLP), speech coding, IM2.VP,Report_VIII},
abstract = {This paper describes employment of non-uniform QMF decomposition to increase the efficiency of a generic wide-band audio coding system based on Frequency Domain Linear Prediction (FDLP). The base line FDLP codec, operating at high bit-rates ( 136 kbps), exploits a uniform QMF decomposition into 64 sub-bands followed by sub-band processing based on FDLP. Here, we propose a non-uniform QMF decomposition into 32 frequency sub-bands obtained by merging 64 uni- form QMF bands. The merging operation is performed in such a way that bandwidths of the resulting critically sampled sub-bands emulate the characteristics of the critical band filters in the human auditory system. Such frequency decomposition, when employed in the FDLP audio codec, results in a bit-rate reduction of 40\% over the base line. We also describe the complete audio codec, which provides high-fidelity audio compression at 66 kbps. In subjective listening tests, the FDLP codec outperforms MPEG-1 Layer 3 (MP3) and achieves similar qualities as MPEG-4 HE-AAC codec.},
projects = {Idiap},
}

@inproceedings{Kryszczuk2007ICBQstackFace,
title = {Improving classification with class-independent quality measures: q-stack in face verification},
author = {Kryszczuk, K.  and Drygajlo, A. },
booktitle = {Proc. 2nd Int. Conference in Biometrics (ICB 2007)},
year = {2007},
keywords = {Report_VI, IM2.MPR},
owner = {Andrzej}
}

@inproceedings{bruno2009:sigir,
title = {multiview clustering: a late fusion approach using latent models},
author = {Bruno, E.  and Marchand-Maillet, S. },
booktitle = {Proceedings of the 32nd ACM Special Interest Group on Information Retrieval Conference, SIGIR 09},
year = {2009},
keywords = {IM2.MCA, Report_VIII}
}

@article{Lalanne200710,
title = {Visual analysis of corporate network intelligence: abstracting and reasoning on yesterdays for acting today},
author = {Lalanne, D.  and Bertini, E.  and Hertzog, P.  and Bados, P. },
year = {2007},
keywords = {Report_VII, IM2.HMI}
}

@inproceedings{DenisLalanne200911,
title = {Fusion engines for multimodal interfaces: a survey},
author = {Lalanne, D.  and Nigay, L.  and Palanque, P.  and Robinson, P.  and Vanderdonckt, J.  and Ladry, J. -F. },
booktitle = {Proceedings of International Conference on Multimodal Interfaces and Workshop on Machine Learning for Multi-modal Interaction (ICMI-MLMI 2009)},
year = {2009},
keywords = {IM2.HMI, Report_VIII}
}

@inproceedings{LTS-CONF-2008-124,
title = {Using entropy as a stream reliability estimate for audio-visual speech recognition},
author = {Gurban, M.  and Thiran, J. -Ph. },
booktitle = {16th European Signal Processing Conference},
year = {2008},
location = {Lausanne, Switzerland},
url = {http://www.eusipco2008.org/, http://infoscience.epfl.ch/getfile.py?recid=125042&mode=best},
keywords = {Report_VII, IM2.MPR, LTS5},
abstract = {We present a method for dynamically integrating audio-visual information for speech recognition, based on the estimated reliability of the audio and visual streams. Our method uses an information theoretic measure, the entropy derived from the state probability distribution for each stream, as an estimate of reliability. The two modalities, audio and video, are weighted at each time instant according to their reliability. In this way, the weights vary dynamically and are able to adapt to any type of noise in each modality, and more importantly, to unexpected variations in the level of noise.},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/record/125042},
oai-id = {oai:infoscience.epfl.ch:125042},
oai-set = {conf; fulltext; fulltext},
review = {REVIEWED},
status = {PUBLISHED},
unit = {LTS}
}

@inproceedings{LTS-CONF-2008-125,
title = {Dynamic modality weighting for multi-stream HMMs in Audio-Visual Speech Recognition},
author = {Gurban, M.  and Thiran, J. -Ph.  and Drugman, T.  and Dutoit, T. },
booktitle = {10th International Conference on Multimodal Interfaces},
year = {2008},
location = {Chania, Greece},
url = {http://www.telecom.tuc.gr/~potam/icmi2008/, http://infoscience.epfl.ch/getfile.py?recid=125044&mode=best},
keywords = {Report_VII, IM2.MPR, LTS5},
abstract = {Merging decisions from different modalities is a crucial problem in Audio-Visual Speech Recognition. To solve this, state synchronous multi-stream HMMs have been proposed for their important advantage of incorporating stream reliability in their fusion scheme. This paper focuses on stream weight adaptation based on modality confidence estimators. We assume different and time-varying environment noise, as can be encountered in realistic applications, and, for this, adaptive methods are best-suited. Stream reliability is assessed directly through classifier outputs since they are not specific to either noise type or level. The influence of constraining the weights to sum to one is also discussed.},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/record/125044},
oai-id = {oai:infoscience.epfl.ch:125044},
oai-set = {conf; fulltext; fulltext},
review = {REVIEWED},
status = {SUBMITTED},
unit = {LTS}
}

@inproceedings{valente:Interspeech2:2008,
title = {On the combination of auditory and modulation frequency channels for asr applications},
author = {Valente, F.  and Hermansky, H. },
crossref = {valente:rr08-12},
booktitle = {Interspeech 2008},
year = {2008},
location = {Brisbane, Australia},
note = {IDIAP-RR 08-12},
keywords = {IM2.AP, Report_VII},
abstract = {This paper investigates the combination of evidence coming from different frequency channels obtained filtering the speech signal at different auditory and modulation frequencies. In our previous work \cite{icassp2008}}
}

@inproceedings{hakkani07,
title = {Statistical Sentence Extraction for Information Distillation},
author = {Hakkani-Tur, D.  and Tur, G. },
booktitle = {Proc. ICASSP, Honolulu},
year = {2007},
keywords = {Report_VI, IM2.AP}
}

@inproceedings{LTS-CONF-2008-122,
title = {Fast multi-view face tracking with pose estimation},
author = {Meynet, J.  and Arsan, T.  and Cruz Mota, J.  and Thiran, J. -Ph. },
booktitle = {16th European Signal Processing Conference},
year = {2008},
location = {Lausanne},
url = {http://infoscience.epfl.ch/getfile.py?recid=125036&mode=best},
keywords = {Report_VII, IM2.VP, lts5; lts; face detection; face tracking; head pose; condensation},
abstract = {In this paper, a fast and an effective multi-view face tracking algorithm with head pose estimation is introduced. For modeling the face pose we employ a tree of boosted classifiers built using either Haar-like filters or Gauss filters. A first classifier extracts faces of any pose from the background. Then more specific classifiers discriminate between different poses. The tree of classifiers is trained by hierarchically sub-sampling the pose space. Finally, Condensation algorithm is used for tracking the faces. Experiments show large improvements in terms of detection rate and processing speed compared to state-of-the-art algorithms.},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/record/125036},
oai-id = {oai:infoscience.epfl.ch:125036},
oai-set = {conf; fulltext; fulltext},
review = {REVIEWED},
status = {ACCEPTED},
unit = {LTS}
}

@inproceedings{bertolami07nonuniform,
title = {Non-uniform slant correction for handwritten text line recognition},
author = {Bertolami, R.  and Uchida, S.  and Zimmermann, M.  and Bunke, H. },
booktitle = {Proc. 9th Int. Conf. on Document Analysis and Recognition},
year = {2007},
pages = {18--22},
isbn = {978-0-7695-2822-9},
keywords = {Report_VII, IM2.VP},
peer = {yes}
}

@inproceedings{Bologna:2008:WCCI,
title = {a perceptual interface for vision substitution in a color matching experiment},
author = {Bologna, G.  and Deville, B.  and Vinckenbosch, M.  and Pun, T. },
booktitle = {Proceeding on IEEE IJCNN, IEEE World congress on computational intelligence},
year = {2008},
keywords = {IM2.MCA, Report_VIII},
owner = {Bologna},
vgclass = {refpap},
vgproject = {bmi}
}

@inproceedings{Ba:ICME:2009,
title = {Visual activity context for focus of attention estimation in dynamic meetings},
author = {Ba, S.  and Hung, H.  and Odobez, J. -M. },
booktitle = {IEEE Proc. Int. Conf. on Multimedia and Expo (ICME)},
year = {2009},
keywords = {IM2.VP, Report_VIII}
}

@inproceedings{MichealBaechler20093,
title = {Labeled images verification using gaussian mixture models},
author = {Baechler, M.  and Bloechle, J. -L.  and Humm, A.  and Ingold, R.  and Hennebert, J. },
booktitle = {Proceedings of 24th Annual ACM Symposium on Applied Computing (ACM SAC'09)},
year = {2009},
pages = {1331--1336},
keywords = {IM2.VP, Report_VIII}
}

@inproceedings{Radgohar2006,
title = {Phong, augmenting virtual and real gaming experience (demo)},
author = {Radgohar, M.  and Ev\'equoz, F.  and Lalanne, D. },
booktitle = {Symposium on User Interface Software and Technology (UIST 2006)},
year = {2006},
pages = {71--72},
keywords = {Report_VI, IM2.HMI}
}

@inproceedings{Gerber:07a,
title = {Quasi text-independent speaker verification based on pattern matching},
author = {Gerber, M.  and Beutler, R.  and Pfister, B. },
booktitle = {Proceedings of Interspeech},
year = {2007},
organization = {ISCA},
keywords = {Report_VI, IM2.AP}
}

@inproceedings{voloshynovskiy53,
title = {Privacy-preserving multimodal person and object identification},
author = {Koval, O.  and Voloshynovskiy, S.  and Caire, F.  and Bas, P. },
booktitle = {MM\&Sec 2008},
year = {2008},
keywords = {IM2.MPR, Report_VIII}
}

@article{Besson2006_1508/LTSa,
title = {Extraction of audio features specific to speech production for multimodal speaker detection},
author = {Besson, P.  and Popovici, V.  and Vesin, J. M.  and Thiran, J. -Ph.  and Kunt, M. },
journal = {IEEE Transactions on Multimedia},
year = {2008},
volume = {10},
number = {1},
pages = {63--73},
doi = {na},
keywords = {Report_VII, IM2.MPR, joint publication, LTS1; LTS5; speaker detection; multimodal; feature extraction; besson p.}
}

@inproceedings{matena-mobilehci08,
title = {Graphical representation of meetings on mobile devices},
author = {Matena, L.  and Jaimes, A.  and Popescu-Belis, A. },
booktitle = {MobileHCI 2008 Demonstrations (10th ACM International Conference on Human-Computer Interaction with Mobile Devices and Services)},
year = {2008},
keywords = {Report_VII, IM2.HMI},
abstract = {The AMIDA Mobile Meeting Assistant is a system that allows remote participants to attend a meeting through a mobile device. The system improves the engagement in the meeting of the remote participants with respect to voice-only solutions thanks to the use of visual annotations and the capture of slides. The visual focus of attention of meeting participants and other annotations serve to reconstruct a 2D or a 3D representation of the meeting on a mobile device (smart phone). A first version of the system has been implemented, and feedback from a user study and from industrial partners shows that the Mobile Meeting Assistant's functionalities are positively appreciated, and sets priorities for future developments.}
}

@inproceedings{lei07a,
  title     = {Word-conditioned phone {N-grams} for speaker recognition},
  author    = {Lei, H. and Mirghafori, N.},
  booktitle = {Proceedings of ICASSP},
  location  = {Honolulu},
  year      = {2007},
  keywords  = {Report_VI, IM2.AP}
}

@inproceedings{Faria2008,
  author    = {Faria, A. and Morgan, N.},
  title     = {Corrected tandem features for acoustic model training},
  booktitle = {International Conference on Acoustics, Speech, and Signal Processing},
  year      = {2008},
  keywords  = {Report_VII, IM2.AP},
  owner     = {dines}
}

@inproceedings{lei07b,
  title     = {Word-Conditioned {HMM} Supervectors for Speaker Recognition},
  author    = {Lei, H. and Mirghafori, N.},
  booktitle = {Proceedings of Interspeech},
  location  = {Antwerp},
  year      = {2007},
  note      = {(to appear)},
  keywords  = {Report_VI, IM2.AP}
}

@inproceedings{tsamuel:eusipco:2008,
  author    = {Thomas, A. and Ganapathy, S. and Hermansky, H.},
  title     = {Spectro-temporal features for automatic speech recognition using linear prediction in spectral domain},
  booktitle = {16th European Signal Processing Conference (EUSIPCO 2008)},
  year      = {2008},
  location  = {Lausanne},
  crossref  = {tsamuel:rr08-05},
  note      = {IDIAP-RR 08-05},
  keywords  = {IM2.AP, Report_VII},
  abstract  = {Frequency Domain Linear Prediction (FDLP) provides an efficient way to represent temporal envelopes of a signal using auto-regressive models. For the input speech signal, we use FDLP to estimate temporal trajectories of sub-band energy by applying linear prediction on the cosine transform of sub-band signals. The sub-band FDLP envelopes are used to extract spectral and temporal features for speech recognition. The spectral features are derived by integrating the temporal envelopes in short-term frames and the temporal features are formed by converting these envelopes into modulation frequency components. These features are then combined in the phoneme posterior level and used as the input features for a hybrid HMM-ANN based phoneme recognizer. The proposed spectro-temporal features provide a phoneme recognition accuracy of $69.1 \%$ (an improvement of $4.8 \%$ over the Perceptual Linear Prediction (PLP) base-line) for the TIMIT database.}
}

@inproceedings{Rajan_INTERSPEECH_2009,
  author    = {Rajan, P. and Parthasarathi, S. H. K. and Murthy, H.},
  title     = {Robustness of Phase based Features for Speaker Recognition},
  booktitle = {Proceedings of Interspeech},
  year      = {2009},
  crossref  = {Rajan_Idiap-RR-14-2009},
  keywords  = {IM2.AP, Report_VIII},
  abstract  = {This paper demonstrates the robustness of group-delay based features for speech processing. An analysis of group delay functions is presented which show that these features retain formant structure even in noise. Furthermore, a speaker verification task performed on the NIST 2003 database show lesser error rates, when compared with the traditional MFCC features. We also mention about using feature diversity to dynamically choose the feature for every claimed speaker.},
  projects  = {Idiap}
}

@inproceedings{MMSPL-CONF-20095-003,
  author    = {Lee, J.-S. and De Simone, F. and Ebrahimi, T.},
  title     = {Video coding based on audio-visual attention},
  booktitle = {IEEE International Conference on Multimedia and Expo (ICME'09)},
  year      = {2009},
  location  = {New York, USA},
  url       = {http://infoscience.epfl.ch/getfile.py?recid=134765&mode=best},
  keywords  = {video coding, audio-visual attention, cross-modal interaction, source localization, H.264, perceived audio-visual quality, IM2.MCA, Report_VIII},
  abstract  = {This paper proposes an efficient video coding method based on audio-visual attention, which is motivated by the fact that cross-modal interaction significantly affects humans perception of multimedia experiences. First, we propose an audio-visual source localization method to locate the sound source in a video sequence. Then, its result is used for applying spatial blurring to the images in order to reduce redundant high-frequency information and achieve coding efficiency. We demonstrate the effectiveness of the proposed method for H.264/AVC coding along with the results of a subjective test.},
  details   = {http://infoscience.epfl.ch/record/134765},
  oai-id    = {oai:infoscience.epfl.ch:134765},
  oai-set   = {conf; fulltext-public; fulltext},
  unit      = {MMSPL}
}

@inproceedings{alovitt:ICSLP:2007,
  title         = {Truncation confusion patterns in onset consonants},
  author        = {Lovitt, A.},
  booktitle     = {Interspeech 2007},
  year          = {2007},
  note          = {IDIAP-RR 07-05},
  keywords      = {Report_VI, IM2.AP},
  abstract      = {Confusion matrices and truncation experiments have long been a part of psychoacoustic experimentation. However confusion matrices are seldom used to analyze truncation experiments. A truncation experiment was conducted and the confusion patterns were analyzed for 6 consonant-vowels (CVs). The confusion patterns show significant structure as the CV is truncated from the onset of the consonant. These confusions show correlations with both articulatory, acoustic features, and other related CVs. These confusion patterns are shown and explored as they relate to human speech recognition.},
  ipdmembership = {speech},
  ipdxref       = {techreport:alovitt-idiap-rr-07-05.bib},
  pdf           = {ftp://ftp.idiap.ch/pub/papers/2007/alovitt-ICSLP-2007.pdf},
  postscript    = {ftp://ftp.idiap.ch/pub/papers/2007/alovitt-ICSLP-2007.ps.gz}
}

@inproceedings{sfavre:icmi-2008,
  title     = {Role recognition in multiparty recordings using social affiliation networks and discrete distributions},
  author    = {Favre, S. and Salamin, H. and Vinciarelli, A.},
  booktitle = {The Tenth International Conference on Multimodal Interfaces (ICMI 2008)},
  year      = {2008},
  location  = {Chania, Greece},
  note      = {Idiap-RR-64-2008},
  keywords  = {IM2.MCA, Report_VII},
  abstract  = {This paper presents an approach for the recognition of roles in multiparty recordings. The approach includes two major stages: extraction of Social Affiliation Networks (speaker diarization and representation of people in terms of their social interactions), and role recognition (application of discrete probability distributions to map people into roles). The experiments are performed over several corpora, including broadcast data and meeting recordings, for a total of roughly 90 hours of material. The results are satisfactory for the broadcast data (around 80 percent of the data time correctly labeled in terms of role), while they still must be improved in the case of the meeting recordings (around 45 percent of the data time correctly labeled). In both cases, the approach outperforms significantly chance.}
}

@article{JayagopiJournal08,
  author   = {Jayagopi, D.},
  title    = {Modeling dominance in group conversations using nonverbal activity cues},
  journal  = {IEEE Trans. on Audio, Speech, and Language Processing, Special Issue on Multimodal Processing for Speech-based Interactions},
  year     = {2009},
  volume   = {17},
  pages    = {501--513},
  keywords = {IM2.MPR, Report_VIII}
}

@inproceedings{tur06,
  title     = {Model Adaptation for Dialog Act Tagging},
  author    = {Tur, G. and Guz, U. and Hakkani-Tur, D.},
  booktitle = {Proceedings of the IEEE/ACL Workshop on Spoken Language Technology},
  year      = {2006},
  keywords  = {Report_VI, IM2.AP}
}

@inproceedings{grandvalet:ICML-1:2007,
  title         = {More efficiency in multiple kernel learning},
  author        = {Rakotomamonjy, A. and Bach, F. and Canu, S. and Grandvalet, Y.},
  booktitle     = {International Conference on Machine Learning (ICML)},
  year          = {2007},
  note          = {IDIAP-RR 07-18},
  keywords      = {Report_VI, IM2.MPR},
  abstract      = {An efficient and general multiple kernel learning (MKL) algorithm has been recently proposed by Sonnenburg et al. This approach has opened new perspectives since it makes the MKL approach tractable for large-scale problems, by iteratively using existing support vector machine code. However, it turns out that this iterative algorithm needs several iterations before converging towards a reasonable solution. In this paper, we address the MKL problem through an adaptive 2-norm regularization formulation. Weights on each kernel matrix are included in the standard SVM empirical risk minimization problem with a $\ell_1$ constraint to encourage sparsity. We propose an algorithm for solving this problem and provide a new insight on MKL algorithms based on block 1-norm regularization by showing that the two approaches are equivalent. Experimental results show that the resulting algorithm converges rapidly and its efficiency compares favorably to other MKL algorithms.},
  ipdmembership = {learning},
  ipdxref       = {techreport:grandvalet-idiap-rr-07-18.bib},
  pdf           = {ftp://ftp.idiap.ch/pub/papers/2007/grandvalet-ICML-1-2007.pdf},
  postscript    = {ftp://ftp.idiap.ch/pub/papers/2007/grandvalet-ICML-1-2007.ps.gz}
}

@inproceedings{1181047,
  author    = {Rienks, R. and Zhang, D. and Gatica-Perez, D. and Post, W.},
  title     = {Detection and application of influence rankings in small group meetings},
  booktitle = {ICMI '06: Proceedings of the 8th international conference on Multimodal interfaces},
  year      = {2006},
  pages     = {257--264},
  publisher = {ACM Press},
  address   = {New York, NY, USA},
  isbn      = {1-59593-541-X},
  doi       = {10.1145/1180995.1181047},
  keywords  = {Report_VI, IM2.MPR}
}

@inproceedings{pop09-icmimlmi09,
  title     = {A multimedia retrieval system using speech input},
  author    = {Popescu-Belis, A. and Poller, P. and Kilgour, J. and Boertjes, E. and Carletta, J. and Castronovo, S. and Fapso, M. and Flynn, M. and Nanchen, A. and Wilson, T. and de Wit, J. and Yazdani, M.},
  booktitle = {ICMI-MLMI 2009 (11th International Conference on Multimodal Interfaces and 6th Workshop on Machine Learning for Multimodal Interaction)},
  year      = {2009},
  keywords  = {IM2.MPR, Report_VIII}
}

@inproceedings{motlicek:mlmi:2007,
  author    = {Motlicek, P. and Hermansky, H. and Ganapathy, S. and Garudadri, H.},
  title     = {Frequency domain linear prediction for qmf sub-bands and applications to audio coding},
  booktitle = {4th Joint Workshop on Multimodal Interaction and Related Machine Learning Algorithms (MLMI)},
  series    = {Lecture Notes in Computer Science},
  year      = {2007},
  pages     = {248--258},
  crossref  = {motlicek:rr07-16},
  note      = {IDIAP-RR 07-16},
  keywords  = {IM2.AP, Report_VI},
  abstract  = {This paper proposes an analysis technique for wide-band audio applications based on the predictability of the temporal evolution of Quadrature Mirror Filter (QMF) sub-band signals. The input audio signal is first decomposed into 64 sub-band signals using QMF decomposition. The temporal envelopes in critically sampled QMF sub-bands are approximated using frequency domain linear prediction applied over relatively long time segments (e.g. 1000 ms). Line Spectral Frequency parameters related to autoregressive models are computed and quantized in each frequency sub-band. The sub-band residuals are quantized in the frequency domain using a combination of split Vector Quantization (VQ) (for magnitudes) and uniform scalar quantization (for phases). In the decoder, the sub-band signal is reconstructed using the quantized residual and the corresponding quantized envelope. Finally, application of inverse QMF reconstructs the audio signal. Even with simple quantization techniques and without any sophisticated modules, the proposed audio coder provides encouraging results in objective quality tests. Also, the proposed coder is easily scalable across a wide range of bit-rates.}
}

@inproceedings{Vergyri:ICSA:2008,
  title     = {Development of the {SRI}/{Nightingale} {Arabic} {ASR} system},
  author    = {Vergyri, D. and Mandal, A. and Wang, W. and Stolcke, A. and Zheng, J. and Graciarena, M. and Rybach, D. and Gollan, C. and Schlater, R. and Kirchoff, K. and Faria, A. and Morgan, N.},
  booktitle = {9th International Conference of the ISCA (Interspeech 2008), Brisbane, Australia},
  year      = {2008},
  pages     = {1437--1440},
  keywords  = {IM2.AP, Report_VIII}
}

@inproceedings{Yao_CVPR-VS2007_2007,
  title     = {Multi-layer background subtraction based on color and texture},
  author    = {Yao, J. and Odobez, J.-M.},
  booktitle = {CVPR 2007 Workshop on Visual Surveillance (VS2007)},
  year      = {2007},
  pages     = {1--8},
  crossref  = {Yao_Idiap-RR-67-2007},
  doi       = {10.1109/cvpr.2007.383497},
  keywords  = {IM2.VP, Report_VI},
  abstract  = {In this paper, we propose a robust multi-layer background subtraction technique which takes advantages of local texture features represented by local binary patterns (LBP) and photometric invariant color measurements in RGB color space. LBP can work robustly with respective to light variation on rich texture regions but not so efficiently on uniform regions. In the latter case, color information should overcome LBP's limitation. Due to the illumination invariance of both the LBP feature and the selected color feature, the method is able to handle local illumination changes such as cast shadows from moving objects. Due to the use of a simple layer-based strategy, the approach can model moving background pixels with quasiperiodic flickering as well as background scenes which may vary over time due to the addition and removal of long-time stationary objects. Finally, the use of a cross-bilateral filter allows to implicitly smooth detection results over regions of similar intensity and preserve object boundaries. Numerical and qualitative experimental results on both simulated and real data demonstrate the robustness of the proposed method.}
}

@inproceedings{Noceti_ICIAP_2009,
  author    = {Noceti, N. and Caputo, B. and Castellini, C. and Baldassarre, L. and Barla, A. and Rosasco, L. and Odone, F. and Sandini, G.},
  title     = {Towards a theoretical framework for learning multi-modal patterns for embodied agents},
  booktitle = {International Conference on Image Analysis and Processing},
  year      = {2009},
  keywords  = {IM2.MPR, Report_VIII},
  projects  = {Idiap, DIRAC}
}

@inproceedings{millan:2008:icra,
  title     = {Brain-controlled robots},
  author    = {Millán, J. del R.},
  booktitle = {IEEE International Conference on Robotics and Automation (ICRA 2008)},
  year      = {2008},
  isbn      = {978-1-4244-1646-2},
  issn      = {1050-4729},
  doi       = {10.1109/robot.2008.4543175},
  keywords  = {IM2.BMI, Report_VII},
  abstract  = {The idea of moving robots or prosthetic devices not by manual control, but by mere ``thinking'' (i.e., the brain activity of human subjects) has fascinated researchers for the last 30 years, but it is only now that first experiments have shown the possibility to do so. How can brainwaves be used to directly control robots? Most of the hope for braincontrolled robots comes from invasive approaches that provide detailed single neuron activity recorded from microelectrodes implanted in the brain [1]. The motivation for these invasive approaches is that it has been widely shown that motor parameters related to hand and arm movements are encoded in a distributed and redundant way by ensembles of neurons in the motor system of the brain---motor, premotor and posterior parietal cortex. For humans, however, it is preferable to use non-invasive approaches to avoid health risks and the associated ethical concerns. Most non-invasive brain-computer interfaces (BCI) use electroencephalogram (EEG) signals; i.e., the electrical brain activity recorded from electrodes placed on the scalp. The main source of the EEG is the synchronous activity of thousands of cortical neurons. Thus, EEG signals suffer from a reduced spatial resolution and increased noise due to measurements on the scalp. As a consequence, current EEG-based brain-actuated devices are limited by a low channel capacity and are considered too slow for controlling rapid and complex sequences of robot movements. But, recently, we have shown for the first time that online analysis of EEG signals, if used in combination with advanced robotics and machine learning techniques, is sufficient for humans to continuously control a mobile robot [2] and a wheelchair [3]. In this article we will review our work on non-invasive brain-controlled robots and discuss some of the challenges ahead.}
}

@article{Baker:ISPM:2009,
  title    = {Research developments and directions in speech recognition and understanding},
  author   = {Baker, J. and Deng, L. and Glass, J. and Khudanpur, S. and Lee, C.-H. and Morgan, N. and O'Shaughnessy, D.},
  journal  = {IEEE Signal Processing Magazine},
  year     = {2009},
  volume   = {26},
  number   = {4},
  pages    = {78--85},
  keywords = {IM2.AP, Report_VIII}
}

@article{EnricoBertini20096,
  author   = {Bertini, E. and Lalanne, D. and Rigamonti, M.},
  title    = {Extended excentric labeling},
  journal  = {International Journal of the Eurographics Association},
  year     = {2009},
  volume   = {28},
  keywords = {IM2.HMI, Report_VIII}
}

@inproceedings{marchand2009:wiamis,
  author    = {Marchand-Maillet, S. and Szekely, E. and Bruno, E.},
  title     = {Optimizing strategies for the exploration of social-networks and associated data collections},
  booktitle = {Proceedings of the International Workshop on Image Analysis for Multimedia Interactive Services (WIAMIS'09) - Special session on "People, Pixels, Peers: Interactive Content in Social Networks"},
  year      = {2009},
  note      = {(invited)},
  url       = {http://viper.unige.ch/documents/pdf/marchand2009_wiamis.pdf},
  keywords  = {IM2.MCA, Report_VIII}
}

@inproceedings{Kumatani_INTERSPEECH_2008,
  title     = {Maximum kurtosis beamforming with the generalized sidelobe canceller},
  author    = {Kumatani, K. and McDonough, J. and Rauch, B. and Garner, P. N. and Li, W. and Dines, J.},
  booktitle = {Proceedings of INTERSPEECH},
  month     = sep,
  year      = {2008},
  location  = {Brisbane, Australia},
  keywords  = {IM2.AP, Report_VIII},
  abstract  = {This paper presents an adaptive beamforming application based on the capture of far-field speech data from a real single speaker in a real meeting room. After the position of a speaker is estimated by a speaker tracking system, we construct a subband-domain beamformer in generalized sidelobe canceller (GSC) configuration. In contrast to conventional practice, we then optimize the active weight vectors of the GSC so that kurtosis of output signals is maximized. Our beamforming algorithms can suppress noise and reverberation without the signal cancellation problems encountered in conventional beamforming algorithms. We demonstrate the effectiveness of our proposed techniques through a series of automatic speech recognition experiments on the Multi-Channel Wall Street Journal Audio Visual Corpus (MC-WSJ-AV). The beamforming algorithm proposed here achieved a 13.6\% WER, whereas the simple delay-and-sum beamformer provided a WER of 17.8\%.},
  projects  = {AMIDA}
}

@inproceedings{gonzalez-et-al-2008,
  author    = {Gonzalez, G. and Fleuret, F. and Fua, P.},
  title     = {Automated delineation of dendritic networks in noisy image stacks},
  booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
  year      = {2008},
  pages     = {214--227},
  keywords  = {IM2.VP, Report_VIII}
}

@inproceedings{gonzalez-et-al-2009,
  author    = {Gonzalez, G. and Fleuret, F. and Fua, P.},
  title     = {Learning rotational features for filament detection},
  booktitle = {Proceedings of the IEEE international conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2009},
  note      = {(to appear)},
  keywords  = {IM2.VP, Report_VIII}
}

@techreport{luo:rr06-52,
  author        = {Luo, J. and Pronobis, A. and Caputo, B. and Jensfelt, P.},
  title         = {Incremental learning for place recognition in dynamic environments},
  institution   = {IDIAP},
  type          = {IDIAP-RR},
  number        = {52},
  year          = {2006},
  keywords      = {Report_VI, IM2.MPR.HMI, joint publication},
  abstract      = {Vision--based place recognition is a desirable feature for an autonomous mobile system. In order to work in realistic scenarios, a visual recognition algorithm should have two key properties: robustness and adaptability. This paper focuses on the latter, and presents a discriminative incremental learning approach to place recognition. We use a recently introduced version of the fixed--partition incremental SVM, which allows to control the memory requirements as the system updates its internal representation. At the same time, it preserves the recognition performance of the batch algorithm and runs online. In order to assess the method, we acquired a database capturing the intrinsic variability of places over time. Extensive experiments show the power and the potential of the approach.},
  ipdmembership = {vision},
  pdf           = {ftp://ftp.idiap.ch/pub/reports/2006/luo-idiap-rr-06-52.pdf},
  postscript    = {ftp://ftp.idiap.ch/pub/reports/2006/luo-idiap-rr-06-52.ps.gz}
}

@techreport{Mesot2007,
  title       = {A gaussian sum smoother for inference in switching linear dynamical systems},
  author      = {Mesot, B. and Barber, D.},
  institution = {Idiap Research Institute},
  year        = {2007},
  note        = {Also in: IEEE Signal Processing Letters},
  keywords    = {Report_VII, IM2.AP},
  owner       = {dines}
}

@inproceedings{Valente_INTERSPEECH2009-2_2009,
  title     = {Hierarchical Processing of the Modulation Spectrum for GALE Mandarin LVCSR system},
  author    = {Valente, F. and Magimai-Doss, M. and Plahl, C. and Suman, R.},
  booktitle = {Proceedings of the 10th Annual Conference of the International Speech Communication Association (Interspeech)},
  year      = {2009},
  location  = {Brighton},
  keywords  = {speech recognition, TANDEM features, IM2.AP, Report_VIII},
  abstract  = {This paper aims at investigating the use of TANDEM features based on hierarchical processing of the modulation spectrum. The study is done in the framework of the GALE project for recognition of Mandarin Broadcast data. We describe the improvements obtained using the hierarchical processing and the addition of features like pitch and short-term critical band energy. Results are consistent with previous findings on a different LVCSR task suggesting that the proposed technique is effective and robust across several conditions. Furthermore we describe integration into RWTH GALE LVCSR system trained on 1600 hours of Mandarin data and present progress across the GALE 2007 and GALE 2008 RWTH systems resulting in approximatively 20\% CER reduction on several data set.},
  projects  = {Idiap}
}

@incollection{Keshet_WILEY-3_2009,
  author    = {Keshet, J.},
  title     = {A Proposal for a Kernel-based Algorithm for Large Vocabulary Continuous Speech Recognition},
  editor    = {Keshet, J. and Bengio, S.},
  booktitle = {Automatic Speech and Speaker Recognition: Large Margin and Kernel Methods},
  publisher = {John Wiley and Sons},
  year      = {2009},
  keywords  = {IM2.AP, Report_VIII},
  abstract  = {We present a proposal of a kernel-based model for large vocabulary continuous speech recognizer. The continuous speech recognition is described as a problem of finding the best phoneme sequence and its best time span, where the phonemes are generated from all permissible word sequences. A non-probabilistic score is assigned to every phoneme sequence and time span sequence, according to a kernel-based acoustic model and a kernel-based language model. The acoustic model is described in terms of segments, where each segment corresponds to a whole phoneme, and it generalizes Segmental Models for the non-probabilistic setup. The language model is based on discriminative language model recently proposed by Roark et al. (2007). We devise a loss function based on the word error rate and present a large margin training procedure for the kernel models, which aims at minimizing this loss function. Finally, we discuss the practical issues of the implementation of kernel-based continuous speech recognition model by presenting an efficient iterative algorithm and considering the decoding process. We conclude the chapter by a brief discussion on the model limitations and future work. This chapter does not introduce any experimental results.},
  projects  = {Idiap}
}

@phdthesis{Mesot2008,
  title    = {Switching linear dynamical systems for noise robust speech recognition of isolated digits},
  author   = {Mesot, B.},
  year     = {2008},
  school   = {STI School of Engineering, EPFL},
  keywords = {Report_VII, IM2.AP},
  owner    = {dines}
}

@techreport{pronobis:rr07-17,
  title         = {Confidence-based cue integration for visual place recognition},
  author        = {Pronobis, A. and Caputo, B.},
  year          = {2007},
  type          = {IDIAP-RR},
  number        = {17},
  institution   = {IDIAP},
  keywords      = {Report_VI, IM2.VP},
  abstract      = {A distinctive feature of intelligent systems is their capability to analyze their level of expertise for a given task; in other words, they know what they know. As a way towards this ambitious goal, this paper presents an algorithm for recognition able to measure its own level of confidence and, in case of uncertainty, to seek for extra information so to increase its own knowledge and ultimately achieve better performance. We focus on the visual place recognition problem for topological localization, and we take an SVM approach. We propose a new method for measuring the confidence level of the classification output, based on the distance of a test image and the average distance of training vectors. This method is combined with a discriminative accumulation scheme for cue integration. We show with extensive experiments that the resulting algorithm achieves better performances for two visual cues than the classic single cue SVM on the same task, while minimising the computational load. More important, our method provides a reliable measure of the level of confidence of the decision.},
  ipdmembership = {vision},
  pdf           = {ftp://ftp.idiap.ch/pub/reports/2007/pronobis-idiap-rr-07-17.pdf},
  postscript    = {ftp://ftp.idiap.ch/pub/reports/2007/pronobis-idiap-rr-07-17.ps.gz}
}

@inproceedings{aradilla:icassp:2007,
  title         = {An acoustic model based on {Kullback-Leibler} divergence for posterior features},
  author        = {Aradilla, G. and Vepa, J. and Bourlard, H.},
  booktitle     = {IEEE Int. Conf. on Acoustics, Speech, and Signal Processing (ICASSP)},
  year          = {2007},
  note          = {IDIAP-RR 06-60},
  keywords      = {Report_VI, IM2.AP},
  abstract      = {This paper investigates the use of features based on posterior probabilities of subword units such as phonemes. These features are typically transformed when used as inputs for a hidden Markov model with mixture of Gaussians as emission distribution (HMM/GMM). In this work, we introduce a novel acoustic model that avoids the Gaussian assumption and directly uses posterior features without any transformation. This model is described by a finite state machine where each state is characterized by a target distribution and the cost function associated to each state is given by the Kullback-Leibler (KL) divergence between its target distribution and the posterior features. Furthermore, hybrid HMM/ANN system can be seen as a particular case of this KL-based model where state target distributions are predefined. A training method is also presented that minimizes the KL-divergence between the state target distributions and the posterior features.},
  ipdmembership = {speech}
}

@inproceedings{Tommasi_IMAGECLEF_2008,
  author    = {Tommasi, T. and Orabona, F. and Caputo, B.},
  title     = {An SVM Confidence-Based Approach to Medical Image Annotation},
  editor    = {Peters, C. and Giampiccolo, D. and Ferro, N.},
  booktitle = {Evaluating Systems for Multilingual and Multimodal Information Access -- 9th Workshop of the Cross-Language Evaluation Forum},
  series    = {LNCS},
  year      = {2008},
  keywords  = {IM2.MPR, Report_VIII},
  abstract  = {This paper presents the algorithms and results of the ''idiap'' team participation to the ImageCLEFmed annotation task in 2008. On the basis of our successful experience in 2007 we decided to integrate two different local structural and textural descriptors. Cues are com- bined through concatenation of feature vectors and through the Multi- Cue Kernel. The challenge this year was to annotate images coming mainly from classes with only few training examples. We tackled the problem on two fronts: (1) we introduced a further integration strategy using SVM as an opinion maker; (2) we enriched the poorly populated classes adding virtual examples. We submitted several runs considering different combinations of the proposed techniques. The run jointly using the feature concatenation, the confidence-based opinion fusion and the virtual examples ranked first among all submissions.},
  projects  = {Idiap, EMMA}
}

@article{Maganti_tasl08,
  author   = {McCowan, I. and Maganti, H. K. and Gatica-Perez, D.},
  title    = {Speech enhancement and recognition in meetings with an audio-visual sensor array},
  journal  = {IEEE Trans. on Audio, Speech, and Language Processing},
  year     = {2007},
  volume   = {15},
  number   = {8},
  pages    = {2257--2269},
  keywords = {Report_VII, IM2.MPR}
}

@incollection{millan:2006:mit-online,
  author    = {Millán, J. del R. and Buttfield, A. and Vidaurre, C. and Krauledat, M. and Schlögl, A. and Shenoy, P. and Blankertz, B. and Rao, R. P. N. and Cabeza, R. and Pfurtscheller, G. and Müller, K.-R.},
  title     = {Adaptation in brain-computer interfaces},
  editor    = {Dornhege, G. and Millán, J. del R. and Hinterberger, T. and McFarland, D. and Müller, K.-R.},
  booktitle = {Towards Brain-Computer Interfacing},
  publisher = {The MIT Press},
  year      = {2007},
  keywords  = {IM2.BCI, Report_VII},
  abstract  = {One major challenge in Brain-Computer Interface (BCI) research is to cope with the inherent nonstationarity of the recorded brain signals caused by changes in the subjects brain processes during an experiment. Online adaptation of the classifier embedded into the BCI is a possible way of tackling this issue. In this chapter we investigate the effect of adaptation on the performance of the classifier embedded in three different BCI systems, all of them based on non-invasive electroencephalogram (EEG) signals. Through this adaptation we aim to keep the classifier constantly tuned to the EEG signals it is receiving in the current session. Although the experimental results reported here show the benefits of online adaptation, some questions need still to be addressed. The chapter ends discussing some of these open issues.}
}

@mastersthesis{huang07a,
  title    = {Robust and rapid speaker diarization},
  author   = {Huang, Y.},
  school   = {University of California, Berkeley},
  year     = {2007},
  keywords = {Report_VII, IM2.AP}
}

@inproceedings{huang07b,
  title     = {A Fast-Match approach for robust, faster than real-time Speaker Diarization},
  author    = {Huang, Y. and Vinyals, O. and Friedland, G. and Müller, C. and Mirghafori, N. and Wooters, C.},
  booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU 07)},
  location  = {Kyoto},
  year      = {2007},
  keywords  = {Report_VII, IM2.AP}
}

@techreport{huang07c,
  title       = {Speeding up speaker diarization by using prosodic features},
  author      = {Huang, Y. and Friedland, G. and Müller, C. and Mirghafori, N.},
  institution = {International Computer Science Institute, Berkeley, California},
  number      = {TR-07-004},
  year        = {2007},
  keywords    = {Report_VII, IM2.AP}
}

@article{Eurasip2007,
  title    = {Transforming {3D} coloured pixels into musical instrument notes for vision substitution applications},
  author   = {Bologna, G. and Deville, B. and Pun, T. and Vinckenbosch, M.},
  journal  = {Eurasip J. of Image and Video Processing, Special Issue: Image and Video Processing for Disability},
  year     = {2007},
  note     = {(to appear)},
  keywords = {Report_VI, IM2.MPR}
}

@inproceedings{Chiquet2006,
  title     = {Elcano, a tangible multimedia browser (demo)},
  author    = {Chiquet, H. and Ev{\'e}quoz, F. and Lalanne, D.},
  booktitle = {Symposium on User Interface Software and Technology (UIST 2006)},
  year      = {2006},
  pages     = {51--52},
  keywords  = {Report_VI, IM2.HMI}
}

@inproceedings{Naturel_ICPR_2008,
  title     = {Detecting queues at vending machines: a statistical layered approach},
  author    = {Naturel, X. and Odobez, J.-M.},
  booktitle = {Proc. Int. Conf. on Pattern Recognition (ICPR)},
  year      = {2008},
  location  = {Tampa},
  crossref  = {naturel:rr08-04},
  keywords  = {IM2.MPR, Report_VIII},
  abstract  = {This paper presents a method for monitoring activities at a ticket vending machine in a video-surveillance context. Rather than relying on the output of a tracking module, which is prone to errors, the events are directly recognized from image measurements. This especially does not require tracking. A statistical layered approach is proposed, where in the first layer, several sub-events are defined and detected using a discriminative approach. The second layer uses the result of the first and models the temporal relationships of the high-level event using a Hidden Markov Model (HMM). Results are assessed on 3h30 of real video footage coming from Turin metro station.},
  projects  = {Idiap, CARETAKER}
}

@inproceedings{Koval:ACM2006,
  author    = {Koval, O.  and Voloshynovskiy, S.  and Holotyak, T.  and Pun, T. },
  title     = {Information-theoretic analysis of steganalysis in real images},
  booktitle = {ACM Multimedia and Security Workshop 2006},
  year      = {2006},
  url       = {http://vision.unige.ch/publications/postscript/2006/mmsec169-koval.ps},
  keywords  = {Report_VI, IM2.MPR},
  vgclass   = {refpap},
  vgproject = {watermarking}
}

@article{Meynet2006_1476/LTS,
title = {Face Detection with Boosted {Gaussian} Features},
author = {Meynet, J. and Popovici, V. and Thiran, J.-Ph.},
journal = {Pattern Recognition},
year = {2007},
volume = {40},
number = {8},
pages = {2283--2291},
doi = {10.1016/j.patcog.2007.02.001},
keywords = {Report_VI, Adaboost, face detection, Gaussian features, lts5, IM2.VP, joint publication},
abstract = {Detecting faces in images is a key step in numerous computer vision applications, such as face recognition or facial expression analysis. Automatic face detection is a difficult task because of the large face intra-class variability which is due to the important influence of the environmental conditions on the face appearance. We propose new features based on anisotropic Gaussian filters for detecting frontal faces in complex images. The performances of our face detector based on these new features have been evaluated on reference test sets, and clearly show improvements compared to the state-of-the-art.}
}

@techreport{buttfield:rr06-16,
title = {Online classifier adaptation in brain-computer interfaces},
author = {Buttfield, A. and Mill{\'a}n, J. del R.},
year = {2006},
type = {IDIAP-RR},
number = {16},
institution = {IDIAP},
keywords = {Report_VI, IM2.BMI},
abstract = {Brain-computer interfaces (BCIs) aim to provide a new channel of communication by enabling the subject to control an external system by using purely mental commands. One method of doing this without invasive surgical procedures is by measuring the electrical activity of the brain on the scalp through electroencephalography (EEG). A major obstacle to developing complex EEG-based BCI systems that provide a number of intuitive mental commands is the high variability of EEG signals. EEG signals from the same subject vary considerably within a single session and between sessions on the same or different days. To deal with this we are investigating methods of adapting the classifier while it is being used by the subject. By keeping the classifier constantly tuned to the EEG signals of the current session we hope to improve the performance of the classifier and allow the subject to learn to use the BCI more effectively. This paper discusses preliminary offline and online experiments towards this goal, focusing on the initial training period when the task that the subject is trying to achieve is known and thus supervised adaptation methods can be used. In these experiments the subjects were asked to perform three mental commands (imagination of left and right hand movements, and a language task) and the EEG signals were classified with a Gaussian classifier.},
ipdmembership = {learning},
pdf = {ftp://ftp.idiap.ch/pub/reports/2006/buttfield-idiap-rr-06-16.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2006/buttfield-idiap-rr-06-16.ps.gz}
}

@inproceedings{pinto:TSD:2008,
title = {Reverse correlation for analyzing {MLP} posterior features in {ASR}},
author = {Pinto, J. P. and Sivaram, G. S. V. S. and Hermansky, H.},
crossref = {pinto:rr08-13},
booktitle = {11th International Conference on Text, Speech and Dialogue (TSD)},
year = {2008},
pages = {469--476},
location = {Brno, Czech Republic},
note = {IDIAP-RR 08-13},
doi = {10.1007/978-3-540-87391-4_60},
keywords = {IM2.AP, Report_VII},
abstract = {In this work, we investigate the reverse correlation technique for analyzing posterior feature extraction using a multilayered perceptron trained on multi-resolution RASTA (MRASTA) features. The filter bank in MRASTA feature extraction is motivated by human auditory modeling. The MLP is trained based on an error criterion and is purely data driven. In this work, we analyze the functionality of the combined system using reverse correlation analysis.}
}

@inproceedings{LTS-CONF-2007-063,
title = {Recent advances in brain-computer interfaces},
author = {Hoffmann, U. and Vesin, J. M. and Ebrahimi, T.},
booktitle = {IEEE International Workshop on Multimedia Signal Processing},
year = {2007},
location = {Chania, Crete, Greece},
note = {Invited Paper},
url = {http://www.mmsp2007.org/},
keywords = {Report_VII, IM2.BMI},
abstract = {A brain-computer interface (BCI) is a communication system that translates brain activity into commands for a computer or other devices. In other words, a BCI allows users to act on their environment by using only brain activity, without using peripheral nerves and muscles. The major goal of BCI research is to develop systems that allow disabled users to communicate with other persons, to control artificial limbs, or to control their environment. An alternative application area for brain-computer interfaces (BCIs) lies in the field of multimedia communication. To develop systems for usage in the field of assistive technology or multimedia communication, many aspects of BCI systems are currently being investigated. Research areas include evaluation of invasive and noninvasive technologies to measure brain activity, evaluation of control signals (i.e. patterns of brain activity that can be used for communication), development of algorithms for translation of brain signals into computer commands, and the development of new BCI applications. In this paper we give an introduction to some of the aspects of BCI research mentioned above, present a concrete example of a BCI system, and highlight recent developments and open problems.}
}

@inproceedings{KryszDryg2,
  author    = {Kryszczuk, K.  and Drygajlo, A. },
  title     = {Impact of feature correlations on separation between bivariate normal distributions},
  booktitle = {19th International Conference on Pattern Recognition},
  year      = {2008},
  keywords  = {IM2.MPR, Report_VIII}
}

@inproceedings{Friedland_ACMMM_2009,
  author    = {Friedland, G.  and Yeo, C.  and Hung, H. },
  title     = {Visual Speaker Localization Aided by Acoustic Models},
  booktitle = {ACM Multimedia},
  year      = {2009},
  keywords  = {IM2.MPR, Report_VIII},
  abstract  = {The following paper presents a novel audio-visual approach for unsupervised speaker locationing. Using recordings from a single, low-resolution room overview camera and a single far-field microphone, a state-of-the art audio-only speaker localization system (traditionally called speaker diarization) is extended so that both acoustic and visual models are estimated as part of a joint unsupervised optimization problem. The speaker diarization system first automatically determines the number of speakers and estimates ''who spoke when'', then, in a second step, the visual models are used to infer the location of the speakers in the video. The experiments were performed on real-world meetings using 4.5 hours of the publicly available AMI meeting corpus. The proposed system is able to exploit audio-visual integration to not only improve the accuracy of a state-of-the-art (audioonly) speaker diarization, but also adds visual speaker locationing at little incremental engineering and computation costs.},
  projects  = {Idiap, AMIDA, IM2}
}

@inproceedings{KokFroVer-ICME.08,
title = {Fast keyword detection with sparse time-frequency models},
author = {Kokiopoulou, E. and Frossard, P. and Verscheure, O.},
booktitle = {IEEE Int. Conf. on Multimedia \& Expo (ICME)},
year = {2008},
keywords = {Report_VII, IM2.DMA.VP, joint publication}
}

@techreport{vijayasenan:rr07-31,
title = {Agglomerative information bottleneck for speaker diarization of meetings data},
author = {Valente, F. and Bourlard, H. and Vijayasenan, D.},
year = {2007},
type = {IDIAP-RR},
number = {31},
institution = {IDIAP},
note = {Submitted for publication},
keywords = {Report_VI, IM2.AP},
abstract = {In this paper, we investigate the use of agglomerative Information Bottleneck (aIB) clustering for the speaker diarization task of meetings data. Contrary to the state-of-the-art diarization systems that model individual speakers with Gaussian Mixture Models, the proposed algorithm is completely non-parametric. Both clustering and model selection issues of non-parametric models are addressed in this work. The proposed algorithm is evaluated on meeting data on the RT06 evaluation data set. The system is able to achieve Diarization Error Rates comparable to state-of-the-art systems at a much lower computational complexity.},
ipdmembership = {speech},
pdf = {ftp://ftp.idiap.ch/pub/reports/2007/vijayasenan-idiap-rr-07-31.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2007/vijayasenan-idiap-rr-07-31.ps.gz}
}

@incollection{janin06a,
title = {The {ICSI-SRI} Spring 2006 Meeting Evaluation System},
author = {Janin, A. and Stolcke, A. and Anguera, X. and Boakye, K. and Cetin, O. and Frankel, J. and Zheng, J.},
editor = {Renals, S. and Bengio, S.},
booktitle = {Machine Learning for Multimodal Interaction: Third International Workshop (MLMI 2006)},
series = {Lecture Notes in Computer Science},
publisher = {Springer},
year = {2006},
keywords = {Report_VI, IM2.AP}
}

@article{Kryszczuk2008,
  author   = {Kryszczuk, K.  and Drygajlo, A. },
  title    = {Credence estimation and error prediction in biometric identity verification},
  journal  = {Signal Processing},
  year     = {2008},
  volume   = {88},
  number   = {4},
  pages    = {916--925},
  url      = {http://www.sciencedirect.com/science/article/B6V18-4PWKSHC-4/1/f19a72696595422518223e8f9e83f9a4},
  keywords = {Report_VII, IM2.MPR, Biometric identity verification}
}

@article{bertolami08hidden,
title = {Hidden {Markov} model based ensemble methods for offline handwritten text line recognition},
author = {Bertolami, R. and Bunke, H.},
journal = {Pattern Recognition},
year = {2008},
volume = {41},
number = {11},
pages = {3452--3460},
keywords = {IM2.VP, Report_VIII},
peer = {yes}
}

@inproceedings{bertolami08including,
  author    = {Bertolami, R.  and Bunke, H. },
  title     = {Including language model information in the combination of handwritten text line recognizers},
  booktitle = {Proc. 11th Int. Conf. on Frontiers in Handwriting Recognition},
  year      = {2008},
  pages     = {25--30},
  keywords  = {IM2.VP, Report_VIII},
  peer      = {yes}
}

@inproceedings{Billard007,
title = {Calibration-free eye gaze direction detection with {Gaussian} processes},
author = {Noris, B. and Benmachiche, K. and Billard, A.},
booktitle = {International Conference on Computer Vision Theory and Applications (VISAPP 08)},
year = {2008},
keywords = {IM2.MPR, Report_VIII}
}

@article{LTS-ARTICLE-2008-065,
  author   = {Gui, L.  and Thiran, J. -Ph.  and Paragios, N. },
  title    = {Cooperative object segmentation and behavior inference in image sequences},
  journal  = {International Journal of Computer Vision},
  year     = {2008},
  issn     = {0920-5691},
  doi      = {10.1007/s11263-008-0146-4},
  keywords = {Report_VII, IM2.VP, image segmentation; behavior inference; gesture recognition; LTS5}
}

@article{LTS-ARTICLE-2007-014,
title = {Matching pursuit-based shape representation and recognition using scale-space},
author = {Mendels, F. and Thiran, J.-Ph. and Vandergheynst, P.},
journal = {International Journal of Imaging Systems and Technology},
year = {2006},
volume = {6},
number = {15},
pages = {162--180},
url = {http://infoscience.epfl.ch/getfile.py?recid=101552&mode=best},
keywords = {Report_VI, matching pursuit ; scale-space shape representation ;shape recognition ; sparse representation; LTS2; LTS5; lts2; lts5, IM2.VP},
details = {http://infoscience.epfl.ch/search.py?recid=101552},
oai-id = {oai:infoscience.epfl.ch:101552},
oai-set = {article},
review = {REVIEWED},
status = {PUBLISHED},
unit = {LTS}
}

@article{LTS-ARTICLE-2007-015,
title = {Information Theoretic Combination of Classifiers},
author = {Meynet, J. and Thiran, J.-Ph.},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
year = {2008},
note = {ITS},
keywords = {Report_VII, IM2.VP, combination of classifiers, information theory, support ; vector machines, diversity, majority voting, ensembles, lts5},
abstract = {Combining several classifiers has proved to be an effective machine learning technique. Two concepts clearly influence the performances of an ensemble of classifiers: the diversity between classifiers and the individual accuracies of the classifiers. In this paper we propose an information theoretic framework to establish a link between these quantities. As they appear to be contradictory, we propose an information theoretic score (ITS) that expresses a trade-off between individual accuracy and diversity. This technique can be directly used, for example, for selecting an optimal ensemble in a pool of classifiers. We perform experiments in the context of overproduction and selection of classifiers. We show that the selection based on the ITS outperforms state-of-the-art diversity-based selection techniques.},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/record/102328},
oai-id = {oai:infoscience.epfl.ch:102328},
oai-set = {article},
review = {REVIEWED},
status = {SUBMITTED},
unit = {LTS}
}

@article{LTS-ARTICLE-2007-012,
title = {An efficient {P300}-based brain-computer interface for disabled subjects},
author = {Hoffmann, U. and Vesin, J. M. and Ebrahimi, T. and Diserens, K.},
journal = {Journal of Neuroscience Methods},
year = {2008},
volume = {167},
number = {1},
pages = {115--125},
note = {Datasets and MATLAB-Code are available at http://bci.epfl.ch},
doi = {10.1016/j.jneumeth.2007.03.005},
keywords = {Report_VII, IM2.BMI,LTS1},
abstract = {A brain-computer interface (BCI) is a communication system that translates brain-activity into commands for a computer or other devices. In other words, a BCI allows users to act on their environment by using only brain-activity, without using peripheral nerves and muscles. In this paper, we present a BCI that achieves high classification accuracy and high bitrates for both disabled and able-bodied subjects. The system is based on the P300 evoked potential and is tested with five severely disabled and four able-bodied subjects. For four of the disabled subjects classification accuracies of 100\% are obtained. The bitrates obtained for the disabled subjects range between 10 and 25 bits/min. The effect of different electrode configurations and machine learning algorithms on classification accuracy is tested. Further factors that are possibly important for obtaining good classification accuracy in P300-based BCI systems for disabled subjects are discussed.}
}

@article{LTS-ARTICLE-2008-061,
title = {Ensembles of {SVMs} using an Information Theoretic Criterion},
author = {Meynet, J. and Thiran, J.-Ph.},
journal = {Pattern Recognition Letters},
year = {2008},
note = {ITS},
keywords = {Report_VII, IM2.VP, support vector machines; combination of classifiers; ensembles; information theory; diversity; lts5},
abstract = {Training Support Vector Machines (SVMs) can become very challenging in large scale datasets. The problem can be addressed by training several lower complexity SVMs on local subsets of the training set. In fact, combining the resulting SVMs in parallel can significantly reduce the training complexity and also improve the classification performances. In order to obtain effective classifier ensembles, classifiers need to be both diverse and individually accurate. In this paper we propose an algorithm for training ensembles of SVMs by taking into account the diversity between each parallel classifier. For this, we use an information theoretic criterion that expresses a trade-off between individual accuracy and diversity. The parallel SVMs are trained jointly using an adaptation of the Kernel-Adatron algorithm for learning on-line multiple SVMs. The results are compared to standard multiple SVMs techniques on reference large scale datasets.},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/record/117788},
oai-id = {oai:infoscience.epfl.ch:117788},
oai-set = {article},
review = {REVIEWED},
status = {SUBMITTED},
unit = {LTS}
}

@inproceedings{LTS-CONF-2007-019,
title = {Information Theoretic Combination of Classifiers with Application to {AdaBoost}},
author = {Meynet, J. and Thiran, J.-Ph.},
booktitle = {7th International Workshop on Multiple Classifier Systems (MCS)},
year = {2007},
location = {Prague},
note = {ITS},
keywords = {Report_VI, ITS, lts5, pattern recognition, classifier combination, information theory, adaboost, diversity; IM2.VP},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/search.py?recid=104339},
oai-id = {oai:infoscience.epfl.ch:104339},
oai-set = {conf},
review = {REVIEWED},
status = {PUBLISHED},
unit = {LTS}
}

@article{Armstrong-8-ISSCO,
title = {Le r\^ole des m\'etriques d'\'evaluation dans le processus de recherche en {TAL}},
author = {Popescu-Belis, A.},
journal = {TAL (Traitement Automatique des Langues)},
year = {2007},
volume = {47},
number = {2},
keywords = {Report_VI, IM2.DMA}
}

@inproceedings{pinto:IS:2008,
  author    = {Pinto, J. P.  and Hermansky, H. },
  title     = {Combining evidence from a generative and a discriminative model in phoneme recognition},
  booktitle = {Proceedings of Interspeech 2008},
  year      = {2008},
  location  = {Brisbane, Australia},
  crossref  = {pinto:rr08-20},
  note      = {IDIAP-RR 08-20},
  keywords  = {IM2.AP, Report_VII},
  abstract  = {We investigate the use of the log-likelihood of the features obtained from a generative Gaussian mixture model, and the posterior probability of phonemes from a discriminative multilayered perceptron in multi-stream combination for recognition of phonemes. Multi-stream combination techniques, namely early integration and late integration are used to combine the evidence from these models. By using multi-stream combination, we obtain a phoneme recognition accuracy of 74\% on the standard TIMIT database, an absolute improvement of 2.5\% over the single best stream.}
}

@article{Ferrez08,
title = {Error-Related {EEG} Potentials Generated During Simulated Brain-Computer Interaction},
author = {Ferrez, P. W. and Mill{\'a}n, J. del R.},
journal = {IEEE Trans. on Biomedical Engineering},
year = {2008},
volume = {55},
number = {3},
pages = {923--929},
keywords = {IM2.BMI, Report_VIII}
}

@article{tsamuel:ieee-letters:2008,
  author   = {Thomas, A.  and Ganapathy, S.  and Hermansky, H. },
  title    = {Recognition of reverberant speech using frequency domain linear prediction},
  journal  = {IEEE Signal Processing Letters},
  year     = {2008},
  crossref = {tsamuel:rr08-41},
  note     = {IDIAP-RR 08-41},
  keywords = {IM2.AP, Report_VII},
  abstract = {Performance of a typical automatic speech recognition (ASR) system severely degrades when it encounters speech from reverberant environments. Part of the reason for this degradation is the feature extraction techniques that use analysis windows which are much shorter than typical room impulse responses. We present a feature extraction technique based on modeling temporal envelopes of the speech signal in narrow sub-bands using Frequency Domain Linear Prediction (FDLP). FDLP provides an all-pole approximation of the Hilbert envelope of the signal obtained by linear prediction on cosine transform of the signal. ASR experiments on speech data degraded with a number of room impulse responses (with varying degrees of distortion) show significant performance improvements for the proposed FDLP features when compared to other robust feature extraction techniques (average relative reduction of $24 \%$ in word error rate). Similar improvements are also obtained for far-field data which contain natural reverberation in background noise. These results are achieved without any noticeable degradation in performance for clean speech.}
}

@inproceedings{LTS-CONF-2007-010,
title = {Dimensionality Reduction with Adaptive Approximation},
author = {Kokiopoulou, E. and Frossard, P.},
booktitle = {IEEE Int. Conf. on Multimedia \& Expo (ICME)},
year = {2007},
location = {Beijing, China},
url = {http://infoscience.epfl.ch/getfile.py?recid=100973&mode=best},
keywords = {Report_VI, IM2.VP},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/search.py?recid=100973},
oai-id = {oai:infoscience.epfl.ch:100973},
oai-set = {conf},
review = {REVIEWED},
status = {ACCEPTED},
unit = {LTS}
}

@phdthesis{norman-poh:phd-thesis:2006,
  author   = {Poh, N. },
  title    = {Multi-system biometric authentication: optimal fusion and user-specific information},
  school   = {\'Ecole Polytechnique F\'ed\'erale de Lausanne},
  year     = {2006},
  keywords = {Report_VI, IM2.MPR, multiple classifier system, pattern recognition, user-specific processing},
  abstract = {Verifying a person's identity claim by combining multiple biometric systems (fusion) is a promising solution to identity theft and automatic access control. This thesis contributes to the state-of-the-art of multimodal biometric fusion by improving the understanding of fusion and by enhancing fusion performance using information specific to a user. One problem to deal with at the score level fusion is to combine system outputs of different types. Two statistically sound representations of scores are probability and log-likelihood ratio (LLR). While they are equivalent in theory, LLR is much more useful in practice because its distribution can be approximated by a Gaussian distribution, which makes it useful to analyze the problem of fusion. Furthermore, its score statistics (mean and covariance) conditioned on the claimed user identity can be better exploited. Our first contribution is to estimate the fusion performance given the class-conditional score statistics and given a particular fusion operator/classifier. Thanks to the score statistics, we can predict fusion performance with reasonable accuracy, identify conditions which favor a particular fusion operator, study the joint phenomenon of combining system outputs with different degrees of strength and correlation and possibly correct the adverse effect of bias (due to the score-level mismatch between training and test sets) on fusion. While in practice the class-conditional Gaussian assumption is not always true, the estimated performance is found to be acceptable. Our second contribution is to exploit the user-specific prior knowledge by limiting the class-conditional Gaussian assumption to each user. We exploit this hypothesis in two strategies. In the first strategy, we combine a user-specific fusion classifier with a user-independent fusion classifier by means of two LLR scores, which are then weighted to obtain a single output. We show that combining both user-specific and user-independent LLR outputs always results in improved performance than using the better of the two. In the second strategy, we propose a statistic called the user-specific F-ratio, which measures the discriminative power of a given user based on the Gaussian assumption. Although similar class separability measures exist, e.g., the Fisher-ratio for a two-class problem and the d-prime statistic, F-ratio is more suitable because it is related to Equal Error Rate in a closed form. F-ratio is used in the following applications: a user-specific score normalization procedure, a user-specific criterion to rank users and a user-specific fusion operator that selectively considers a subset of systems for fusion. The resultant fusion operator leads to a statistically significantly increased performance with respect to the state-of-the-art fusion approaches. Even though the applications are different, the proposed methods share the following common advantages. Firstly, they are robust to deviation from the Gaussian assumption. Secondly, they are robust to few training data samples thanks to Bayesian adaptation. Finally, they consider both the client and impostor information simultaneously.},
  ipdmembership = {Learning},
  pdf      = {ftp://ftp.idiap.ch/pub/reports/2006/norman-poh-phd-thesis.pdf},
  postscript = {ftp://ftp.idiap.ch/pub/reports/2006/norman-poh-phd-thesis.ps.gz}
}

@inproceedings{apb-lrec-workshop-2008,
title = {Reference-based vs. task-based evaluation of human language technology},
author = {Popescu-Belis, A.},
booktitle = {LREC 2008 ELRA Workshop on Evaluation: "Looking into the Future of Evaluation: When automatic metrics meet task-based and performance-based approaches"},
year = {2008},
pages = {12--16},
organization = {ELRA},
keywords = {Report_VII, IM2.DMA},
abstract = {This paper starts from the ISO distinction of three types of evaluation procedures -- internal, external and in use -- and proposes to match these types to the three types of human language technology (HLT) systems: analysis, generation, and interactive. The paper explains why internal evaluation is not suitable to measure the qualities of HLT systems, and shows that reference-based external evaluation is best adapted to ``analysis'' systems, task-based evaluation to ``interactive'' systems, while ``generation'' systems can be subject to both types of evaluation. In particular, some limits of reference-based external evaluation are shown in the case of generation systems. Finally, the paper shows that contextual evaluation, as illustrated by the FEMTI framework for MT evaluation, is an effective method for getting reference-based evaluation closer to the users of a system.}
}

@techreport{Motlicek_Idiap-RR-71-2008,
  author      = {Motlicek, P.  and Ganapathy, S.  and Hermansky, H. },
  title       = {Entropy coding of Quantized Spectral Components in FDLP audio codec},
  institution = {Idiap},
  type        = {Idiap-RR},
  number      = {Idiap-RR-71-2008},
  year        = {2008},
  keywords    = {IM2.AP, Report_VIII},
  abstract    = {Audio codec based on Frequency Domain Linear Prediction (FDLP) exploits auto-regressive modeling to approximate instantaneous energy in critical frequency sub-bands of relatively long input segments. Current version of the FDLP codec operating at 66 kbps has shown to provide comparable subjective listening quality results to the state-of-the-art codecs on similar bit-rates even without employing strategic blocks, such as entropy coding or simultaneous masking. This paper describes an experimental work to increase compression efficiency of the FDLP codec provided by employing entropy coding. Unlike traditionally used Huffman coding in current audio coding systems, we describe an efficient way to exploit Arithmetic coding to entropy compress quantized magnitude spectral components of the sub-band FDLP residuals. Such approach outperforms Huffman coding algorithm and provides more than 3 kbps bit-rate reduction.},
  projects    = {Idiap, IM2}
}

@inproceedings{Armstrong-4-ISSCO,
title = {{MedSLT}: a multi-lingual grammar-based medical speech translator},
author = {Bouillon, P. and Chatzichrisafis, N. and Halimi, S. and Hockey, B. A. and Isahara, H. and Kanzaki, K. and Nakao, Y. and Novellas Vall, B. and Rayner, M. and Santaholma, M. and Starlander, M.},
booktitle = {Proceedings of First International Workshop on Intercultural Collaboration},
year = {2007},
organization = {IWIC2007},
location = {Kyoto, Japan},
note = {January 25--26},
keywords = {Report_VI, IM2.HMI}
}

@article{millan:2007:cin-wheelchair,
title = {Context-based filtering for assisted brain-actuated wheelchair driving},
author = {Vanacker, G. and Mill{\'a}n, J. del R. and Lew, E. and Ferrez, P. W. and Gal{\'a}n, F. and Philips, J. and van Brussel, H. and Nuttin, M.},
journal = {Computational Intelligence and Neuroscience},
year = {2007},
volume = {2007},
pages = {3},
issn = {1687-5265},
keywords = {IM2.BCI, Report_VI},
abstract = {Controlling a robotic device by using human brain signals is an interesting and challenging task. The device may be complicated to control and the non-stationary nature of the brain signals provides for a rather unstable input. With the use of intelligent processing algorithms adapted to the task at hand however, the performance can be increased. This paper introduces a shared control system that helps the subject in driving an intelligent wheelchair with a non-invasive brain interface. The subject's steering intentions are estimated from electroencephalogram (EEG) signals and passed through to the shared control system before being sent to the wheelchair motors. Experimental results show a possibility for significant improvement in the overall driving performance when using the shared control system compared to driving without it. These results have been obtained with 2 healthy subjects during their first day of training with the brain-actuated wheelchair.}
}

@inproceedings{pinto:icassp-phnrecog:2008,
title = {Exploiting contextual information for improved phoneme recognition},
author = {Pinto, J. P. and Hermansky, H. and Yegnanarayana, B. and Magimai-Doss, M.},
crossref = {pinto:rr07-65},
booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing, (ICASSP 2008)},
year = {2008},
pages = {4449--4452},
location = {Las Vegas, NV},
isbn = {978-1-4244-1483-3},
issn = {1520-6149},
note = {IDIAP-RR 07-65},
doi = {10.1109/icassp.2008.4518643},
keywords = {IM2.AP, Report_VII},
abstract = {In this paper, we investigate the significance of contextual information in a phoneme recognition system using the hidden Markov model - artificial neural network paradigm. Contextual information is probed at the feature level as well as at the output of the multilayered perceptron. At the feature level, we analyse and compare different methods to model sub-phonemic classes. To exploit the contextual information at the output of the multilayered perceptron, we propose the hierarchical estimation of phoneme posterior probabilities. The best phoneme (excluding silence) recognition accuracy of 73.4\% on the TIMIT database is comparable to that of the state-of-the-art systems, but more emphasis is on analysis of the contextual information.}
}

@inproceedings{Cuendet2007,
  author    = {Cuendet, S.  and Shriberg, E.  and Favre, B.  and Fung, J.  and Hakkani-Tur, D. },
  title     = {An analysis of sentence segmentation features for broadcast news, broadcast conversations, and meetings},
  booktitle = {SIGIR Workshop on Searching Conversational Spontaneous Speech},
  year      = {2007},
  keywords  = {Report_VII, IM2.AP},
  owner     = {dines}
}

@inproceedings{Wollmer_ICASSP_2009,
title = {Robust Discriminative Keyword Spotting for Emotionally Colored Spontaneous Speech using Bidirectional {LSTM} Networks},
author = {W{\"o}llmer, M. and Eyben, F. and Keshet, J. and Graves, A. and Schuller, B. and Rigoll, G.},
booktitle = {IEEE International Conference on Acoustic, Speech, and Signal Processing},
year = {2009},
keywords = {IM2.AP, Report_VIII},
abstract = {In this paper we propose a new technique for robust keyword spotting that uses bidirectional Long Short-Term Memory (BLSTM) recurrent neural nets to incorporate contextual information in speech decoding. Our approach overcomes the drawbacks of generative HMM modeling by applying a discriminative learning procedure that non-linearly maps speech features into an abstract vector space. By incorporating the outputs of a BLSTM network into the speech features, it is able to make use of past and future context for phoneme predictions. The robustness of the approach is evaluated on a keyword spotting task using the HUMAINE Sensitive Artificial Listener (SAL) database, which contains accented, spontaneous, and emotionally colored speech. The test is particularly stringent because the system is not trained on the SAL database, but only on the TIMIT corpus of read speech. We show that our method prevails over a discriminative keyword spotter without BLSTM-enhanced feature functions, which in turn has been proven to outperform HMM-based techniques.},
projects = {Idiap}
}

@techreport{jkeshet-idiap-rr-07-44,
title = {Theoretical foundations for large-margin kernel-based continuous speech recognition},
author = {Keshet, J. },
year = {2007},
type = {Idiap-RR},
number = {Idiap-RR-44-2007},
institution = {IDIAP},
keywords = {IM2.AP, Report_VII}
}

@inproceedings{Ullah_ICRA_2008,
title = {Towards Robust Place Recognition for Robot Localization},
author = {Ullah, M. M.  and Pronobis, A.  and Caputo, B.  and Luo, J.  and Jensfelt, P.  and Christensen, H. I. },
booktitle = {IEEE International Conference on Robotics and Automation},
year = {2008},
keywords = {IM2.VP, Report_VIII},
abstract = {Localization and context interpretation are two key competences for mobile robot systems. Visual place recognition, as opposed to purely geometrical models, holds promise of higher flexibility and association of semantics to the model. Ideally, a place recognition algorithm should be robust to dynamic changes and it should perform consistently when recognizing a room (for instance a corridor) in different geographical locations. Also, it should be able to categorize places, a crucial capability for transfer of knowledge and continuous learning. In order to test the suitability of visual recognition algorithms for these tasks, this paper presents a new database, acquired in three different labs across Europe. It contains image sequences of several rooms under dynamic changes, acquired at the same time with a perspective and omnidirectional camera, mounted on a socket. We assess this new database with an appearance based algorithm that combines local features with support vector machines through an ad-hoc kernel. Results show the effectiveness of the approach and the value of the database},
projects = {Idiap,
DIRAC},
}

@inproceedings{LTS-CONF-2007-099,
title = {Relevant Feature Selection for Audio-Visual Speech Recognition},
author = {Drugman, T.  and Gurban, M.  and Thiran, J. -Ph. },
booktitle = {9th International Workshop on Multimedia Signal Processing (MMSP)},
year = {2007},
location = {Chania, Crete, Greece},
url = {http://www.mmsp2007.org/, http://infoscience.epfl.ch/getfile.py?recid=114221&mode=best},
keywords = {Report_VII, IM2.MPR, LTS5},
abstract = {We present a feature selection method based on information theoretic measures, targeted at multimodal signal processing, showing how we can quantitatively assess the relevance of features from different modalities. We are able to find the features with the highest amount of information relevant for the recognition task, and at the same having minimal redundancy. Our application is audio-visual speech recognition, and in particular selecting relevant visual features. Experimental results show that our method outperforms other feature selection algorithms from the literature by improving recognition accuracy even with a significantly reduced number of features.},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/record/114221},
oai-id = {oai:infoscience.epfl.ch:114221},
oai-set = {conf; fulltext},
review = {REVIEWED},
status = {PUBLISHED},
unit = {LTS}
}

@book{Popescu-Belis_SPRINGER-2_2008,
title = {Machine Learning for Multimodal Interaction {V}},
editor = {Popescu-Belis, A.  and Stiefelhagen, R. },
booktitle = {Machine Learning for Multimodal Interaction (4th International Workshop, MLMI 2008, Utrecht, The Netherlands, September 2008, Proceedings)},
series = {LNCS},
year = {2008},
volume = {5237},
publisher = {Springer-Verlag},
isbn = {978-3-540-85852-2},
keywords = {IM2.MPR, Report_VII}
}

@inproceedings{grossmann:eccv:2008,
title = {Calibration from statistical properties of the visual world},
author = {Grossmann, E.  and Gaspar, J. -A.  and Orabona, F. },
crossref = {grossmann:rr08-63},
booktitle = {European Conf. on Computer Vision},
year = {2008},
note = {IDIAP-RR 08-63},
keywords = {IM2.VP, Report_VIII},
abstract = {What does a blind entity need in order to determine the geometry of the set of photocells that it carries through a changing lightfield? In this paper, we show that very crude knowledge of some statistical properties of the environment is sufficient for this task. We show that some dissimilarity measures between pairs of signals produced by photocells are strongly related to the angular separation between the photocells. Based on real-world data, we model this relation quantitatively, using dissimilarity measures based on the correlation and conditional entropy. We show that this model allows to estimate the angular separation from the dissimilarity. Although the resulting estimators are not very accurate, they maintain their performance throughout different visual environments, suggesting that the model encodes a very general property of our visual world. Finally, leveraging this method to estimate angles from signal pairs, we show how distance geometry techniques allow to recover the complete sensor geometry.},
ipdmembership = {vision},
projects = {Idiap},
pdf = {ftp://ftp.idiap.ch/pub/papers/2008/grossmann-eccv-2008.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2008/grossmann-eccv-2008.ps.gz}
}

@inproceedings{Ba:ICME:2008,
title = {Visual focus of attention estimation from head pose posterior probability distributions},
author = {Ba, S.  and Odobez, J. -M. },
booktitle = {IEEE Proc. Int. Conf. on Multimedia and Expo (ICME)},
year = {2008},
keywords = {Report_VII, IM2.VP},
abstract = {We address the problem of recognizing the visual focus of attention (VFOA) of meeting participants from their head pose and contextual cues. The main contribution of the paper is the use of a head pose posterior distribution as a representation of the head pose information contained in the image data. This posterior encodes the probabilities of the different head poses given the image data, and constitute therefore a richer representation of the data than the mean or the mode of this distribution, as done in all previous work. These observations are exploited in a joint interaction model of all meeting participants pose observations, VFOAs, speaking status and of environmental contextual cues. Numerical experiments on a public database of 4 meetings of 22min on average show that this change of representation allows for a 5.4\% gain with respect to the standard approach using head pose as observation.}
}

@techreport{ullal:rr06-46,
title = {Audio coding based on long temporal segments: experiments with quantization of excitation signal},
author = {Ullal, V.  and Motlicek, P. },
year = {2006},
type = {IDIAP-RR},
number = {46},
institution = {IDIAP},
keywords = {Report_VI, IM2.AP},
abstract = {In this paper, we describe additional experiments based on a novel audio coding technique that uses an autoregressive model to approximate an audio signal's Hilbert envelope. This technique is performed over long segments (1000 ms) in critical-band-sized sub-bands. We have performed a series of experiments to find more efficient methods of quantizing the frequency components of the Hilbert carrier, which is the excitation found in the temporal audio signal. When using linear quantization, it was found that allocating 5 bits for transmitting the Hilbert carrier every 200 ms was sufficient. Other techniques, such as quantizing the first derivative of phase and using an iterative adaptive threshold, were examined.},
ipdmembership = {speech},
pdf = {ftp://ftp.idiap.ch/pub/reports/2006/ullal-idiap-rr-06-46.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2006/ullal-idiap-rr-06-46.ps.gz}
}

@inproceedings{Duffner_BMVC_2009,
title = {Dynamic Partitioned Sampling For Tracking With Discriminative Features},
author = {Duffner, S.  and Odobez, J. -M.  and Ricci, E. },
booktitle = {Proceedings of the British Machine Vision Conference},
year = {2009},
location = {London},
keywords = {IM2.VP, Report_VIII},
abstract = {We present a multi-cue fusion method for tracking with particle filters which relies on a novel hierarchical sampling strategy. Similarly to previous works, it tackles the problem of tracking in a relatively high-dimensional state space by dividing such a space into partitions, each one corresponding to a single cue, and sampling from them in a hierarchical manner. However, unlike other approaches, the order of partitions is not fixed a priori but changes dynamically depending on the reliability of each cue, i.e. more reliable cues are sampled first. We call this approach Dynamic Partitioned Sampling (DPS). The reliability of each cue is measured in terms of its ability to discriminate the object with respect to the background, where the background is not described by a fixed model or by random patches but is represented by a set of informative "background particles" which are tracked in order to be as similar as possible to the object. The effectiveness of this general framework is demonstrated on the specific problem of head tracking with three different cues: colour, edge and contours. Experimental results prove the robustness of our algorithm in several challenging video sequences.},
projects = {Idiap,
TA2},
}

@inproceedings{riesen07bipartite,
title = {Bipartite graph matching for computing the edit distance of graphs},
author = {Riesen, K.  and Neuhaus, M.  and Bunke, H. },
editor = {Escolano, F.  and Vento, M. },
booktitle = {Graph-Based Representations in Pattern Recognition},
series = {Lecture Notes in Computer Science},
year = {2007},
volume = {4538},
pages = {1--12},
publisher = {Springer},
keywords = {Report_VI, IM2.ACP},
peer = {yes}
}

@inproceedings{Drygajlo5,
title = {Promoting diversity in gaussian mixture ensembles: an application to signature verification},
author = {Richiardi, J.  and Drygajlo, A.  and Todesco, L. },
editor = {Schouten, B.  and Juul, N.  and Drygajlo, A.  and Tistarelli, M. },
booktitle = {Biometrics and Identity Management},
series = {Lecture Notes in Computer Science},
volume = {5372},
year = {2008},
pages = {140--149},
publisher = {Springer},
keywords = {IM2.MPR, Report_VIII}
}

@inproceedings{Ricci:ICIP:2009,
title = {Real-time simultaneous head tracking and pose estimation},
author = {Ricci, E.  and Odobez, J. -M. },
booktitle = {IEEE International Conference on Image Processing (ICIP)},
year = {2009},
keywords = {IM2.VP, Report_VIII}
}

@book{millan:2006:mit-book,
title = {Towards brain-computer interfacing},
editor = {Dornhege, G.  and del R. Millán, J.  and Hinterberger, T.  and McFarland, D.  and Müller, K. -R. },
year = {2007},
publisher = {The MIT Press},
keywords = {Report_VI, IM2.BMI},
ipdmembership = {learning}
}

@techreport{vinyals08a,
title = {Live speaker identification in meetings: "who is speaking now?"},
author = {Vinyals, O.  and Friedland, G. },
number = {TR-08-001},
institution = {International Computer Science Institute, Berkeley, CA},
year = {2008},
keywords = {Report_VII, IM2.AP}
}

@inproceedings{vinyals08b,
title = {Towards semantic analysis of conversations: a system for the live identification of speakers in meetings},
author = {Vinyals, O.  and Friedland, G. },
booktitle = {Proceedings of the IEEE International Conference on Semantic Computing},
location = {Santa Clara, CA},
year = {2008},
note = {to appear},
keywords = {Report_VII, IM2.AP}
}

@inproceedings{vinyals08c,
title = {Modulation spectrogram features for speaker diarization},
author = {Vinyals, O.  and Friedland, G. },
booktitle = {Proceedings of Interspeech 2008},
location = {Brisbane, Australia},
year = {2008},
note = {to appear},
keywords = {Report_VII, IM2.AP}
}

@inproceedings{Frapolli2007,
title = {Dynamic rules: towards interactive games intelligence},
author = {Frapolli, F.  and Hirsbrunner, B.  and Lalanne, D. },
booktitle = {Tangible Play: Research and Design for Tangible and Tabletop Games. Workshop at the 2007 Intelligent User Interfaces Conference (IUI'07)},
year = {2007},
pages = {29--32},
keywords = {Report_VI, IM2.HMI}
}

@article{anguera07c,
title = {Acoustic Beamforming for Speaker Diarization of Meetings},
author = {Anguera, X.  and Wooters, C.  and Hernando, J. },
journal = {IEEE Transactions on Audio, Speech and Language Processing},
year = {2007},
note = {to appear},
keywords = {Report_VI, IM2.AP}
}

@incollection{millan:2006:mit-idiap,
title = {The idiap brain-computer interface: an asynchronous multi-class approach},
author = {Millán, J. del R.  and Ferrez, P. W.  and Buttfield, A. },
editor = {Dornhege, G.  and Millán, J. del R.  and Hinterberger, T.  and McFarland, D.  and Müller, K. -R. },
booktitle = {Towards Brain-Computer Interfacing},
year = {2007},
publisher = {The MIT Press},
keywords = {IM2.BCI, Report_VII},
abstract = {In this paper we give an overview of our work on a self-pace asynchronous BCI that responds every 0.5 seconds. A statistical Gaussian classifier tries to recognize three different mental tasks; it may also respond unknown for uncertain samples as the classifier has incorporated statistical rejection criteria. We report our experience with different subjects. We also describe three brain-actuated applications we have developed: a virtual keyboard, a brain game, and a mobile robot (emulating a motorized wheelchair). Finally, we discuss current research directions we are pursuing in order to improve the performance and robustness of our BCI system, especially for real-time control of brain-actuated robots.}
}

@inproceedings{stoll07,
title = {Speaker Recognition Via Nonlinear Discriminant Features},
author = {Stoll, L.  and Frankel, J.  and Mirghafori, N. },
booktitle = {Proceedings of NOLISP},
location = {Paris, France},
year = {2007},
keywords = {Report_VI, IM2.AP}
}

@phdthesis{Ketabdar_THESIS_2008,
title = {Enhancing posterior based speech recognition systems},
author = {Ketabdar, H. },
year = {2008},
school = {Ecole Polytechnique F\'ed\'erale de Lausanne},
note = {Th\ese Ecole polytechnique f\'ed\'erale de Lausanne EPFL, no 4218 (2008), Facult\'e des sciences et techniques de l'ing\'enieur STI, Section de g\'enie \'electrique et \'electronique, Institut de g\'enie \'electrique et \'electronique IEL (Laboratoire de l'IDIAP LIDIAP). Dir.: Herv\'e Bourlard},
keywords = {IM2.AP, Report_VIII},
abstract = {The use of local phoneme posterior probabilities has been increasingly explored for improving speech recognition systems. Hybrid hidden Markov model / artificial neural network (HMM/ANN) and Tandem are the most successful examples of such systems. In this thesis, we present a principled framework for enhancing the estimation of local posteriors, by integrating phonetic and lexical knowledge, as well as long contextual information. This framework allows for hierarchical estimation, integration and use of local posteriors from the phoneme up to the word level. We propose two approaches for enhancing the posteriors. In the first approach, phoneme posteriors estimated with an ANN (particularly multi-layer Perceptron - MLP) are used as emission probabilities in HMM forward-backward recursions. This yields new enhanced posterior estimates integrating HMM topological constraints (encoding specific phonetic and lexical knowledge), and long context. In the second approach, a temporal context of the regular MLP posteriors is post-processed by a secondary MLP, in order to learn inter and intra dependencies among the phoneme posteriors. The learned knowledge is integrated in the posterior estimation during the inference (forward pass) of the second MLP, resulting in enhanced posteriors. The use of resulting local enhanced posteriors is investigated in a wide range of posterior based speech recognition systems (e.g. Tandem and hybrid HMM/ANN), as a replacement or in combination with the regular MLP posteriors. The enhanced posteriors consistently outperform the regular posteriors in different applications over small and large vocabulary databases.},
}

@incollection{pop09-kipp,
title = {Accessing a large multimodal corpus using an automatic content linking device},
author = {Popescu-Belis, A.  and Carletta, J.  and Kilgour, J.  and Poller, P. },
editor = {Kipp, M.  and Martin, J. -C.  and Paggio, P.  and Heylen, D. },
booktitle = {Multimodal Corpora},
series = {LNAI},
year = {2009},
publisher = {Springer-Verlag},
keywords = {IM2.MPR, IM2.DMA, IM2.HMI, Report_VIII}
}

@techreport{Garner_Idiap-RR-08-2009,
title = {A MAP Approach to Noise Compensation of Speech},
author = {Garner, P. N. },
year = {2009},
type = {Idiap-RR},
number = {Idiap-RR-08-2009},
institution = {Idiap},
keywords = {IM2.AP, Report_VIII},
abstract = {We show that estimation of parameters for the popular Gaussian model of speech in noise can be regularised in a Bayesian sense by use of simple prior distributions. For two example prior distributions, we show that the marginal distribution of the uncorrupted speech is non-Gaussian, but the parameter estimates themselves have tractable solutions. Speech recognition experiments serve to suggest values for hyper-parameters, and demonstrate that the theory is practically applicable.},
projects = {IM2},
}

@inproceedings{Chavarriaga07,
title = {To Err Is Human: Learning from Error Potentials in Brain-Computer Interfaces},
author = {Chavarriaga, R.  and Ferrez, P. W.  and Millán, J. del R. },
booktitle = {1st International Conference on Cognitive Neurodynamics (ICCN 2007)},
year = {2007},
keywords = {IM2.BMI, Report_VIII}
}

@article{Bruno08,
title = {Design of multimodal dissimilarity spaces for retrieval of multimedia documents},
author = {Bruno, E.  and Moënne-Loccoz, N.  and Marchand-Maillet, S. },
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
year = {2008},
note = {to appear},
keywords = {Report_VII, IM2.MCA}
}

@inproceedings{soleymani2008:mlmi2008,
title = {Valence-arousal representation of movie scenes based on multimedia content analysis and user's physiological emotional responses},
author = {Soleymani, M.  and Chanel, G.  and Kierkels, J.  and Pun, T. },
booktitle = {MLMI 2008, 5th Joint Workshop on Machine Learning and Multimodal Interaction},
year = {2008},
note = {(PhD student poster session, with extended abstract)},
keywords = {Report_VII, IM2.MCA}
}

@inproceedings{zhao08,
title = {Multi-stream spectro-temporal features for robust speech recognition},
author = {Zhao, S.  and Morgan, N. },
booktitle = {Proceedings of Interspeech 2008},
location = {Brisbane, Australia},
year = {2008},
note = {to appear},
keywords = {Report_VII, IM2.AP}
}

@inproceedings{Carincotte_IEEE-ITSC_2008,
title = {Understanding Metro Station Usage using Closed Circuit Television Cameras Analysis},
author = {Carincotte, C.  and Naturel, X.  and Hick, M.  and Odobez, J. -M.  and Yao, J.  and Bastide, A.  and Corbucci, B. },
crossref = {naturel:rr08-38},
booktitle = {11th International IEEE Conference on Intelligent Transportation Systems (ITSC)},
year = {2008},
location = {Bejing},
keywords = {IM2.VP, Report_VIII},
abstract = {In this paper, we propose to show how video data available in standard CCTV transportation systems can represent a useful source of information for transportation infrastructure management, optimization and planning if adequately analyzed (e.g. to facilitate equipment usage understanding, to ease diagnostic and planning for system managers). More precisely, we present two algorithms allowing to estimate the number of people in a camera view and to measure the platform time-occupancy by trains. A statistical analysis of the results of each algorithm provide interesting insights regarding station usage. It is also shown that combining information from the algorithms in different views provide a finer understanding of the station usage. An end-user point of view confirms the interest of the proposed analysis.},
projects = {Idiap,
CARETAKER},
}

@inproceedings{beekhof1,
title = {Multi-class classifiers based on binary classifiers: performance, efficiency, and minimum coding matrix distances},
author = {Beekhof, F.  and Voloshynovskiy, S.  and Koval, O.  and Holotyak, T. },
booktitle = {MLSP 2009},
year = {2009},
keywords = {IM2.MPR, Report_VIII}
}

@article{smith:tpami:2008,
title = {Tracking the visual focus of attention for a varying number of wandering people},
author = {Smith, K.  and Ba, S.  and Gatica-Perez, D.  and Odobez, J. -M. },
journal = {IEEE Trans. on Pattern Analysis and Machine Intelligence},
year = {2008},
volume = {30},
number = {7},
pages = {1212--1229},
keywords = {Report_VII, IM2.VP}
}

@inproceedings{Garg01,
title = {Clusterrank: a graph based method for meeting summarization},
author = {Garg, N.  and Favre, B.  and Riedhammer, K.  and Hakkani-Tur, D. },
booktitle = {10th International Conference of the International Speech Communication Association, Brighton, UK},
year = {2009},
keywords = {IM2.AP, Report_VIII}
}

@techreport{cuendet:rr06-64,
title = {Model adaptation for sentence unit segmentation from speech},
author = {Cuendet, S. },
year = {2006},
type = {IDIAP-RR},
number = {64},
institution = {IDIAP},
keywords = {Report_VI, IM2.AP},
abstract = {The sentence segmentation task is a classification task that aims at inserting sentence boundaries in a sequence of words. One of the applications of sentence segmentation is to detect the sentence boundaries in the sequence of words that is output by an automatic speech recognition system (ASR). The purpose of correctly finding the sentence boundaries in ASR transcriptions is to make it possible to use further processing tasks, such as automatic summarization, machine translation, and information extraction. Being a classification task, sentence segmentation requires training data. To reduce the labor-intensive labeling task, available labeled data can be used to train the classifier. The high variability of speech among the various speech styles makes it inefficient to use the classifier from one speech style (designated as out-of-domain) to detect sentence boundaries on another speech style (in-domain) and thus, makes it necessary for one classifier to be adapted before it is used on another speech style. In this work, we first justify the need for adapting data among the broadcast news, conversational telephone and meeting speech styles. We then propose methods to adapt sentence segmentation models trained on conversational telephone speech to meeting conversations style. Our results show that using the model adapted from the telephone conversations, instead of the model trained only on meetings conversation style, significantly improves the performance of the sentence segmentation. Moreover, this improvement holds independently from the amount of in-domain data used. In addition, we also study the differences between speech styles, with statistical measures and by examining the performances of various subsets of features. 
Focusing on broadcast news and meeting speech style, we show that on the meeting speech style, lexical features are more correlated with the sentence boundaries than the prosodic features, whereas it is the contrary on the broadcast news. Furthermore, we observe that prosodic features are more independent from the speech style than lexical features.},
ipdmembership = {cuendet speech},
pdf = {ftp://ftp.idiap.ch/pub/reports/2006/cuendet-idiap-rr-06-64.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2006/cuendet-idiap-rr-06-64.ps.gz}
}

@inproceedings{BrunoDumas20082,
title = {Strengths and weaknesses of software architectures for the rapid creation of tangible and multimodal interfaces},
author = {Dumas, B.  and Lalanne, D.  and Guinard, D.  and Koenig, R.  and Ingold, R. },
booktitle = {Proceedings of 2nd international conference on Tangible and Embedded Interaction (TEI 2008)},
year = {2008},
pages = {47--54},
keywords = {IM2.HMI, Report_VIII}
}

@techreport{vinciarelli:rr06-56,
title = {Assessing the effectiveness of slides as a mean to improve the automatic transcription of oral presentations},
author = {Peregoudov, A.  and Vinciarelli, A.  and Bourlard, H. },
year = {2006},
type = {IDIAP-RR},
number = {56},
institution = {IDIAP},
note = {Submitted for publication},
keywords = {Report_VI, IM2.AP.MCA, joint publication},
abstract = {This paper presents experiments aiming at improving the automatic transcription of oral presentations through the inclusion of the slides in the recognition process. The experiments are performed over a data set of around three hours of material ( 33 kwords and 270 slides) and are based on an approach trying to maximize the similarity between the recognizer output and the content of the slides. The results show that the upper bound to the Word Error Rate (WER) reduction is 1.7\% (obtained by transcribing correctly all words co-occurring in both slides and speech), but that our approach does not produce statistically significant improvements. Results analysis seems to suggest that such results do not depend on the similarity maximization approach, but on the statistical characteristics of the language.},
ipdmembership = {vision},
pdf = {ftp://ftp.idiap.ch/pub/reports/2006/vinciarelli-idiap-rr-06-56.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2006/vinciarelli-idiap-rr-06-56.ps.gz}
}

@article{Pun2,
title = {Handling temporal heterogeneous data for content-based management of large video collections},
author = {Janvier, B.  and Bruno, E.  and Marchand-Maillet, S.  and Pun, T. },
journal = {Multimedia Tools and Applications},
year = {2006},
volume = {30},
pages = {273--288},
keywords = {Report_VI, IM2.MCA}
}

@techreport{hari-rr-06-57,
title = {Unsupervised speech/non-speech detection for automatic speech recognition in meeting rooms},
author = {Maganti, H. K.  and Motlicek, P.  and Gatica-Perez, D. },
year = {2006},
type = {IDIAP-RR},
number = {57},
institution = {IDIAP},
keywords = {Report_VI, IM2.AP},
abstract = {The goal of this work is to provide robust and accurate speech detection for automatic speech recognition (ASR) in meeting room settings. The solution is based on computing long-term modulation spectrum, and examining specific frequency range for dominant speech components to classify speech and non-speech signals for a given audio signal. Manually segmented speech segments, short-term energy, short-term energy and zero-crossing based segmentation techniques, and a recently proposed Multi Layer Perceptron (MLP) classifier system are tested for comparison purposes. Speech recognition evaluations of the segmentation methods are performed on a standard database and tested in conditions where the signal-to-noise ratio (SNR) varies considerably, as in the cases of close-talking headset, lapel, distant microphone array output, and distant microphone. The results reveal that the proposed method is more reliable and less sensitive to mode of signal acquisition and unforeseen conditions.},
ipdmembership = {speech},
language = {English},
pdf = {ftp://ftp.idiap.ch/pub/reports/2006/rr06-57.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2006/rr06-57.ps.gz}
}

@inproceedings{BrunoDumas20084,
title = {Prototyping multimodal interfaces with smuiml modeling language},
author = {Dumas, B.  and Lalanne, D.  and Ingold, R. },
booktitle = {Proceedings of CHI 2008 Workshop on UIDLs for Next Generation User Interfaces (CHI 2008 workshop)},
year = {2008},
pages = {63--66},
keywords = {IM2.HMI, Report_VIII}
}

@techreport{berclaz-et-al-rr2009,
title = {Multiple object tracking using flow linear programming},
author = {Berclaz, J.  and Fleuret, F.  and Fua, P. },
year = {2009},
number = {10-2009},
institution = {IDIAP Research Institute},
keywords = {IM2.VP, Report_VIII}
}

@incollection{shriberg08,
title = {Higher level features in speaker recognition},
author = {Shriberg, E. },
editor = {Müller, C. },
booktitle = {Speaker Classification I},
publisher = {Springer-Verlag},
address = {New York},
year = {2008},
keywords = {Report_VII, IM2.AP}
}

@inproceedings{JayagopiICMI08,
title = {Predicting two facets of social verticality in meetings from five-minute time slices and nonverbal cues},
author = {Jayagopi, D. },
booktitle = {Proc. ICMI},
year = {2008},
keywords = {IM2.MPR, Report_VIII}
}

@inproceedings{JayagopiICMI09,
title = {Discovering group nonverbal conversational patterns with topics},
author = {Jayagopi, D.  and Gatica-Perez, D. },
booktitle = {Proc. ICMI-MLMI},
year = {2009},
note = {accepted for publication},
keywords = {IM2.MPR, Report_VIII}
}

@inproceedings{2008-deville-cvavi,
title = {Guiding the focus of attention of blind people with visual saliency},
author = {Deville, B.  and Bologna, G.  and Vinckenbosch, M.  and Pun, T. },
booktitle = {Workshop on Computer Vision Applications for the Visually Impaired (CVAVI 08), Satellite Workshop of the European Conference on Computer Vision (ECCV 2008), Marseille, France, October 18},
year = {2008},
keywords = {IM2.MCA, Report_VIII}
}

@inproceedings{Motlicek_INTERSPEECH2009-2_2009,
title = {Automatic Out-of-Language Detection Based on Confidence Measures Derived from LVCSR Word and Phone Lattices},
author = {Motlicek, P. },
crossref = {Motlicek_INTERSPEECH2009_2009},
booktitle = {10th Annual Conference of the International Speech Communication Association},
series = {2009 ISCA},
year = {2009},
pages = {1215--1218},
organization = {ISCA},
location = {Brighton, England},
keywords = {IM2.AP, Report_VIII},
abstract = {Confidence Measures (CMs) estimated from Large Vocabulary Continuous Speech Recognition (LVCSR) outputs are commonly used metrics to detect incorrectly recognized words. In this paper, we propose to exploit CMs derived from frame-based word and phone posteriors to detect speech segments containing pronunciations from non-target (alien) languages. The LVCSR system used is built for English, which is the target language, with medium-size recognition vocabulary (5k words). The efficiency of detection is tested on a set comprising speech from three different languages (English, German, Czech). Results achieved indicate that employment of specific temporal context (integrated in the word or phone level) significantly increases the detection accuracies. Furthermore, we show that combination of several CMs can also improve the efficiency of detection.},
projects = {Idiap,
TA2,
AMIDA},
}

@inproceedings{LTS-CONF-2008-111,
title = {An Architecture for TV Content Distributed Search and Retrieval Using the MPEG Query Format (MPQF)},
author = {Tous, R.  and Carreras, A.  and Delgado, J.  and Cordara, G.  and Gianluca, F.  and Peig, E.  and Dufaux, F.  and Galinski, G. },
booktitle = {International Workshop on Ambient Media Delivery and Interactive Television (AMDIT 2008)},
year = {2008},
url = {http://infoscience.epfl.ch/getfile.py?recid=116458&mode=best},
keywords = {Report_VII, IM2.MCA,information retrieval; multimedia retrieval; distributed information retrieval; MPQF; MPEG Query Format; TV},
abstract = {Traditional broadcasting of TV contents begins to coexist with new models of user aware content delivery. The definition of interoperable interfaces for precise content search and retrieval between the different involved parties is a requirement for the deployment of the new audiovisual distribution services. This paper presents the design of an architecture based on the MPEG Query Format (MPQF) for providing the necessary interoperability to deploy distributed audiovisual content search and retrieval networks between content producers, distributors, aggregators and consumer devices. A service-oriented architecture based on Web Services technology is defined. This paper also presents how the architecture can be applied to a real scenario, the XAC (Xarxa IP Audiovisual de Catalunya, Audiovisual IP Network of Catalonia). As far as we know, this is the first paper to apply MPQF to TV Content Distributed Search and Retrieval.},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/record/116458},
oai-id = {oai:infoscience.epfl.ch:116458},
oai-set = {conf; fulltext},
review = {REVIEWED},
status = {PUBLISHED},
unit = {LTS MMSPL}
}

@inproceedings{Hung:ICMI:2008,
title = {Investigating automatic dominance estimation in groups from visual attention and speaking activity},
author = {Hung, H.  and Jayagopi, D.  and Ba, S.  and Odobez, J. -M.  and Gatica-Perez, D. },
booktitle = {International Conference on Multimodal Interfaces (ICMI)},
year = {2008},
keywords = {IM2.VP, Report_VIII},
annote = {NOTE(review): apparent duplicate of entry hungICMI08 (same title, authors, year, venue) -- consider merging the two keys.}
}

@inproceedings{ACMMIR2007,
  title     = {Combining multimodal preferences for multimedia information retrieval},
  author    = {Bruno, E.  and Kludas, J.  and Marchand-Maillet, S. },
  booktitle = {ACM SIGMM - International Workshop on Multimedia Information Retrieval},
  year      = {2007},
  keywords  = {Report_VI, IM2.MCA}
}

@inproceedings{ACMMIR2006,
title = {Dual diffusion model of spreading activation for content-based image retrieval},
author = {Kosinov, S.  and Marchand-Maillet, S.  and Kozintsev, I.  and Dulong, C.  and Pun, T. },
booktitle = {8th ACM SIGMM - International Workshop on Multimedia Information Retrieval},
year = {2006},
location = {Santa Barbara, CA, USA},
keywords = {Report_VI, IM2.MCA}
}

@article{wooters07,
title = {The {ICSI} {RT07s} Speaker Diarization System},
author = {Wooters, C.  and Huijbregts, M. },
journal = {Lecture Notes in Computer Science},
year = {2007},
note = {To appear},
keywords = {Report_VI, IM2.AP}
}

@article{millan:2007:jnm,
title = {High-resolution {EEG} techniques for brain-computer interface applications},
author = {Cincotti, F.  and Mattia, D.  and Aloise, F.  and Bufalari, S.  and Astolfi, L.  and De Vico Fallani, F.  and Tocci, A.  and Bianchi, L.  and Marciani, M. G.  and Gao, S.  and Mill{\'a}n, J. del R.  and Babiloni, F. },
journal = {Journal of Neuroscience Methods},
year = {2007},
volume = {167},
pages = {31--42},
issn = {0165-0270},
keywords = {IM2.BCI, Report_VII},
annote = {NOTE(review): abstract contained mojibake sequences; reconstructed as apostrophe, en-dash and plus-minus signs from context -- verify against the published abstract.},
abstract = {High-resolution electroencephalographic (HREEG) techniques allow estimation of cortical activity based on non-invasive scalp potential measurements, using appropriate models of volume conduction and of neuroelectrical sources. In this study we propose an application of this body of technologies, originally developed to obtain functional images of the brain's electrical activity, in the context of brain--computer interfaces (BCI). Our working hypothesis predicted that, since HREEG pre-processing removes spatial correlation introduced by current conduction in the head structures, by providing the BCI with waveforms that are mostly due to the unmixed activity of a small cortical region, a more reliable classification would be obtained, at least when the activity to detect has a limited generator, which is the case in motor related tasks. HREEG techniques employed in this study rely on (i) individual head models derived from anatomical magnetic resonance images, (ii) distributed source model, composed of a layer of current dipoles, geometrically constrained to the cortical mantle, (iii) depth-weighted minimum L2-norm constraint and Tikhonov regularization for linear inverse problem solution and (iv) estimation of electrical activity in cortical regions of interest corresponding to relevant Brodmann areas. Six subjects were trained to learn self modulation of sensorimotor EEG rhythms, related to the imagination of limb movements. Off-line EEG data was used to estimate waveforms of cortical activity (cortical current density, CCD) on selected regions of interest. CCD waveforms were fed into the BCI computational pipeline as an alternative to raw EEG signals; spectral features are evaluated through statistical tests (r2 analysis), to quantify their reliability for BCI control. These results are compared, within subjects, to analogous results obtained without HREEG techniques. 
The processing procedure was designed in such a way that computations could be split into a setup phase (which includes most of the computational burden) and the actual EEG processing phase, which was limited to a single matrix multiplication. This separation allowed to make the procedure suitable for on-line utilization, and a pilot experiment was performed. Results show that lateralization of electrical activity, which is expected to be contralateral to the imagined movement, is more evident on the estimated CCDs than in the scalp potentials. CCDs produce a pattern of relevant spectral features that is more spatially focused, and has a higher statistical significance (EEG: $0.20 \pm 0.114$ S.D.; CCD: $0.55 \pm 0.16$ S.D.; $p=10^{-5}$). A pilot experiment showed that a trained subject could utilize voluntary modulation of estimated CCDs for accurate (eight targets) on-line control of a cursor. This study showed that it is practically feasible to utilize HREEG techniques for on-line operation of a BCI system; off-line analysis suggests that accuracy of BCI control is enhanced by the proposed method.}
}

@techreport{hemptinne:rr06-69,
title = {Master thesis: integration of the harmonic plus noise model ({HNM}) into the hidden {Markov} model-based speech synthesis system ({HTS})},
author = {Hemptinne, C. },
year = {2006},
type = {IDIAP-RR},
number = {69},
institution = {IDIAP},
keywords = {Report_VI, IM2.AP},
ipdmembership = {speech},
pdf = {ftp://ftp.idiap.ch/pub/reports/2006/hemptinne-idiap-rr-06-69.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2006/hemptinne-idiap-rr-06-69.ps.gz}
}

@article{Pun1,
title = {Brain-computer interaction research at the {Computer Vision and Multimedia Laboratory}, {University of Geneva}},
author = {Pun, T.  and Alecu, T. I.  and Chanel, G.  and Kronegg, J.  and Voloshynovskiy, S. },
journal = {IEEE Trans. Neural Systems and Rehabilitation Engineering, Special Issue on Brain-Computer Interaction},
year = {2006},
volume = {14},
number = {2},
pages = {210--213},
keywords = {Report_VI, IM2.MPR}
}

@techreport{mariethoz:rr06-70,
title = {Discriminant models for text-independent speaker verification},
author = {Mari\'ethoz, J. },
year = {2006},
type = {IDIAP-RR},
number = {70},
institution = {IDIAP},
keywords = {Report_VI, IM2.AP},
abstract = {This thesis addresses text-independent speaker verification from a machine learning point of view. We use the machine learning framework to better define the problem and to develop new unbiased performance measures and statistical tests to compare objectively new approaches. We propose a new interpretation of the state-of-the-art Gaussian Mixture Model based system and show that they are discriminant and equivalent to a mixture of linear classifiers. A general framework for score normalization is also given for both probability and non-probability based models. With this new framework we better show the hypotheses made for the well known Z- and T- score normalization techniques. Several uses of discriminant models are then proposed. In particular, we develop a new sequence kernel for Support Vector Machines that generalizes an other sequence kernel found in the literature. If the latter is limited to a polynomial form the former allows the use of infinite space kernels such as Radial Basis Functions. A variant of this kernel that finds the best match for each frame of the sequence to be compared, actually outperforms the state-of-the-art systems. As our new sequence kernel is computationally costly for long sequences, a clustering technique is proposed for reducing the complexity. We also address in this thesis some problems specific to speaker verification such as the fact that the classes are highly unbalanced. And the use of a specific intra- and inter-class distance distribution is proposed by modifying the kernel in order to assume a Gaussian noise distribution over negative examples. Even if this approach misses some theoretical justification, it gives very good empirical results and opens a new research direction.},
ipdmembership = {learning},
pdf = {ftp://ftp.idiap.ch/pub/reports/2006/mariethoz-idiap-rr-06-70.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2006/mariethoz-idiap-rr-06-70.ps.gz}
}

@inproceedings{Jean-LucBloechle20097,
title = {{OCD}: an optimized and canonical document format},
author = {Bloechle, J. -L.  and Lalanne, D.  and Ingold, R. },
booktitle = {Proceedings of 10th IEEE International Conference on Document Analysis and Recognition (ICDAR 2009)},
year = {2009},
pages = {236--240},
keywords = {IM2.DMA, Report_VIII}
}

@inproceedings{Chanel:MindTrek08,
title = {Boredom, engagement and anxiety as indicators for adaptation to difficulty in games},
author = {Chanel, G.  and Rebetez, C.  and Betrancourt, M.  and Pun, T. },
booktitle = {ACM Mindtrek conference},
year = {2008},
keywords = {IM2.MCA, Report_VIII},
owner = {Chanel},
vgclass = {refpap},
vgproject = {bmi}
}

@article{Lalanne20079,
title = {Supporting human memory with interactive systems},
author = {Lalanne, D.  and van den Hoven, E. },
year = {2007},
pages = {215--216},
keywords = {Report_VII, IM2.HMI},
annote = {NOTE(review): required journal field is missing for this @article; the 2-page span suggests front matter or a workshop abstract -- TODO confirm the venue and add journal (or change the entry type).}
}

@inproceedings{Favre_ACMMULTIMEDIA_2008,
  title     = {Role recognition for meeting participants: an approach based on lexical information and social network analysis},
  author    = {Favre, S.  and Salamin, H.  and Vinciarelli, A.  and Hakkani-Tur, D.  and Garg, N. },
  booktitle = {ACM International Conference on Multimedia},
  year      = {2008},
  crossref  = {salamin:rr08-57},
  keywords  = {IM2.MCA, Report_VII},
  abstract  = {This paper presents experiments on the automatic recognition of roles in meetings. The proposed approach combines two sources of information: the lexical choices made by people playing different roles on one hand, and the Social Networks describing the interactions between the meeting participants on the other hand. Both sources lead to role recognition results significantly higher than chance when used separately, but the best results are obtained with their combination. Preliminary experiments obtained over a corpus of 138 meeting recordings (over 45 hours of material) show that around 70\% of the time is labeled correctly in terms of role.}
}

@book{SchouDryg1,
  title     = {Biometrics and identity management},
  author    = {Schouten, B.  and Juul, N.  and Drygajlo, A.  and Tistarelli, M. },
  publisher = {Springer},
  year      = {2008},
  keywords  = {IM2.MPR, Report_VIII}
}

@inproceedings{Ketabdar2008,
  title     = {Hierarchical integration of phonetic and lexical knowledge in phone posterior estimation},
  author    = {Ketabdar, H.  and Bourlard, H. },
  booktitle = {International Conference on Acoustics, Speech, and Signal Processing},
  year      = {2008},
  keywords  = {Report_VII, IM2.AP},
  abstract  = {Phone posteriors has recently quite often used (as additional features or as local scores) to improve state-of-the-art automatic speech recognition (ASR) systems. Usually, better phone posterior estimates yield better ASR performance. In the present paper we present some initial, yet promising, work towards hierarchically improving these phone posteriors, by implicitly integrating phonetic and lexical knowledge. In the approach investigated here, phone posteriors estimated with a multilayer perceptron (MLP) and short (9 frames) temporal context, are used as input to a second MLP, spanning a longer temporal context (e.g. 19 frames of posteriors) and trained to refine the phone posterior estimates. The rationale behind this is that at the output of every MLP, the information stream is getting simpler (converging to a sequence of binary posterior vectors), and can thus be further processed (using a simpler classifier) by looking at a larger temporal window. Longer term dependencies can be interpreted as phonetic, sub-lexical and lexical knowledge. The resulting enhanced posteriors can then be used for phone and word recognition, in the same way as regular phone posteriors, in hybrid HMM/ANN or Tandem systems. The proposed method has been tested on TIMIT, OGI Numbers and Conversational Telephone Speech (CTS) databases, always resulting in consistent and significant improvements in both phone and word recognition rates.}
}

@inproceedings{Voloshynovskiy:2008:MMSec,
title = {Multimodal authentication based on random projections and distributed coding},
author = {Voloshynovskiy, S.  and Koval, O.  and Pun, T. },
booktitle = {Proceedings of the 10th ACM Workshop on Multimedia \& Security},
year = {2008},
keywords = {Report_VII, IM2.MPR}
}

@inproceedings{raducanu09,
  title     = {You are fired! Nonverbal role analysis in competitive meetings},
  author    = {Raducanu, B.  and Gatica-Perez, D. },
  booktitle = {Proc. ICASSP, Taiwan},
  year      = {2009},
  keywords  = {IM2.MPR, Report_VIII}
}

@inproceedings{grangier:2007:eurospeech,
  title         = {Learning the inter-frame distance for discriminative template-based keyword detection},
  author        = {Grangier, D.  and Bengio, S. },
  booktitle     = {International Conference on Speech Communication and Technology (INTERSPEECH)},
  year          = {2007},
  keywords      = {Report_VI, IM2.MPR},
  abstract      = {This paper proposes a discriminative approach to template-based keyword detection. We introduce a method to learn the distance used to compare acoustic frames, a crucial element for template matching approaches. The proposed algorithm estimates the distance from data, with the objective to produce a detector maximizing the Area Under the receiver operating Curve (AUC), i.e. the standard evaluation measure for the keyword detection problem. The experiments performed over a large corpus, SpeechDatII, suggest that our model is effective compared to an HMM system, e.g. the proposed approach reaches 93.8\% of averaged AUC compared to 87.9\% for the HMM.},
  ipdmembership = {Learning},
  ipdxref       = {techreport:grangier_rr_07-15.bib},
  pdf           = {ftp://ftp.idiap.ch/pub/reports/2007/grangier_eurospeech07.pdf},
  postscript    = {ftp://ftp.idiap.ch/pub/reports/2007/grangier_eurospeech07.ps.gz}
}

@inproceedings{eth_biwi_00424,
  title     = {Retina mosaicing using local features},
  author    = {Cattin, P. C.  and Bay, H.  and van Gool, L.  and Sz\'ekely, G. },
  booktitle = {Medical Image Computing and Computer-Assisted Intervention (MICCAI)},
  series    = {LNCS},
  volume    = {4191},
  year      = {2006},
  pages     = {185--192},
  keywords  = {Report_VI, IM2.VP}
}

@article{bruno2009:jmm,
title = {Multimodal preference aggregation for multimedia information retrieval},
author = {Bruno, E.  and Marchand-Maillet, S. },
journal = {Journal of Multimedia},
year = {2009},
note = {To appear},
url = {http://viper.unige.ch/documents/pdf/bruno2009-jmm.pdf},
keywords = {IM2.MCA, Report_VIII}
}

@inproceedings{hwang07,
title = {Building a Highly Accurate Mandarin Speech Recognizer},
author = {Hwang, M. -Y.  and Peng, G.  and Wang, W.  and Faria, A.  and Heidel, A.  and Ostendorf, M. },
booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU 07)},
location = {Kyoto},
year = {2007},
keywords = {Report_VII, IM2.AP}
}

@inproceedings{Tommasi_CLEF_2008,
title = {Cue Integration for Medical Image Annotation},
author = {Tommasi, T.  and Orabona, F.  and Caputo, B. },
booktitle = {Advances in Multilingual and Multimodal Information Retrieval: 8th Workshop of the Cross-Language Evaluation Forum, CLEF 2007, Budapest, Hungary, September 19-21, 2007, Revised Selected Papers},
series = {LNCS},
year = {2008},
publisher = {Springer-Verlag},
keywords = {IM2.VP, IM2.MPR, Report_VIII},
abstract = {This paper presents the algorithms and results of our participation to the image annotation task of ImageCLEFmed 2007. We proposed a multi-cue approach where images are represented both by global and local descriptors. These cues are combined following two SVM-based strategies. The first algorithm, called Discriminative Accumulation Scheme (DAS), trains an SVM for each feature, and considers as output of each classifier the distance from the separating hyperplane. The final decision is taken on a linear combination of these distances. The second algorithm, that we call Multi Cue Kernel (MCK), uses a new Mercer kernel which can accept as input different features while keeping them separated. The DAS algorithm obtained a score of 29.9, which ranked fifth among all submissions. The MCK algorithm with the one-vs-all and with the one-vs-one multiclass extensions of SVM scored respectively 26.85 and 27.54. These runs ranked first and second among all submissions.},
projects = {Idiap,
EMMA},
}

@inproceedings{Armstrong-5-ISSCO,
title = {Using a wizard of oz as a baseline to determine which system architecture is the best for a spoken language translation system},
author = {Starlander, M. },
booktitle = {Proceedings of Nodalida 2007},
series = {16th Nordic Conference of Computational Linguistics},
year = {2007},
pages = {161--164},
location = {Tartu, Estonia},
note = {24--26 May 2007},
keywords = {Report_VI, IM2.HMI}
}

@inproceedings{hungICMI08,
title = {Investigating automatic dominance estimation in groups from visual attention and speaking activity},
author = {Hung, H.  and Jayagopi, D.  and Ba, S.  and Odobez, J. -M.  and Gatica-Perez, D. },
booktitle = {Proc. ICMI},
year = {2008},
keywords = {IM2.MPR, Report_VIII},
annote = {NOTE(review): apparent duplicate of entry Hung:ICMI:2008 (same title, authors, year, venue) -- consider merging the two keys.}
}

@incollection{schlapbach08offline,
title = {Off-line writer identification and verification using {Gaussian} mixture models},
author = {Schlapbach, A.  and Bunke, H. },
editor = {Marinai, S. },
booktitle = {Machine Learning in Document Analysis and Recognition},
year = {2008},
pages = {409--428},
publisher = {Springer},
keywords = {Report_VII, IM2.VP},
peer = {yes}
}

@phdthesis{Paiement_THESIS_2008,
title = {Probabilistic models for music},
author = {Paiement, J. -F. },
year = {2008},
school = {\'Ecole Polytechnique F\'ed\'erale de Lausanne},
note = {Th\`ese Ecole polytechnique f\'ed\'erale de Lausanne EPFL, no 4148 (2008), Facult\'e des sciences et techniques de l'ing\'enieur STI, Institut de g\'enie \'electrique et \'electronique IEL (Laboratoire de l'IDIAP LIDIAP). Dir.: Herv\'e Bourlard, Samy Bengio},
keywords = {chord progressions, generative models, machine learning, melodies, music, probabilistic models, IM2.AP, Report_VIII},
abstract = {This thesis proposes to analyse symbolic musical data under a statistical viewpoint, using state-of-the-art machine learning techniques. Our main argument is to show that it is possible to design generative models that are able to predict and to generate music given arbitrary contexts in a genre similar to a training corpus, using a minimal amount of data. For instance, a carefully designed generative model could guess what would be a good accompaniment for a given melody. Conversely, we propose generative models in this thesis that can be sampled to generate realistic melodies given harmonic context. Most computer music research has been devoted so far to the direct modeling of audio data. However, most of the music models today do not consider the musical structure at all. We argue that reliable symbolic music models such a the ones presented in this thesis could dramatically improve the performance of audio algorithms applied in more general contexts. Hence, our main contributions in this thesis are three-fold: We have shown empirically that long term dependencies are present in music data and we provide quantitative measures of such dependencies; We have shown empirically that using domain knowledge allows to capture long term dependencies in music signal better than with standard statistical models for temporal data. We describe many probabilistic models aimed to capture various aspects of symbolic polyphonic music. Such models can be used for music prediction. Moreover, these models can be sampled to generate realistic music sequences; We designed various representations for music that could be used as observations by the proposed probabilistic models.},
projects = {Idiap},
}

@inproceedings{odobez:icme:2007,
title = {A cognitive and unsupervised map adaptation approach to the recognition of the focus of attention from head pose},
author = {Odobez, J. -M.  and Ba, S. },
booktitle = {International Conference on Multi-Media \& Expo (ICME07)},
year = {2007},
note = {IDIAP-RR 07-20},
keywords = {Report_VI, IM2.VP},
abstract = {In this paper, the recognition of the visual focus of attention (VFOA) of meeting participants (as defined by their eye gaze direction) from their head pose is addressed. To this end, the head pose observations are modeled using an Hidden Markov Model (HMM) whose hidden states corresponds to the VFOA. The novelties are threefold. First, contrary to previous studies on the topic, in our set-up, the potential VFOA of a person is not restricted to other participants only, but includes environmental targets (a table and a projection screen), which increases the complexity of the task, with more VFOA targets spread in the pan and tilt (as well) gaze space. Second, the HMM parameters are set by exploiting results from the cognitive science on saccadic eye motion, which allows to predict what the head pose should be given an actual gaze target. Third, an unsupervised parameter adaptation step is proposed which accounts for the specific gazing behaviour of each participant. Using a publicly available corpus of 8 meetings featuring 4 persons, we analyze the above methods by evaluating, through objective performance measures, the recognition of the VFOA from head pose information obtained either using a magnetic sensor device or a vision based tracking system.},
ipdmembership = {vision},
ipdxref = {techreport:odobez-idiap-rr-07-20.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2007/odobez-icme-2007.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2007/odobez-icme-2007.ps.gz}
}

@inproceedings{Noris:ICCV:2008,
title = {Calibration-free eye gaze direction detection with gaussian processes},
author = {Noris, B.  and Benmachiche, K.  and Billard, A. },
booktitle = {International Conference on Computer Vision Theory and Applications (VISAPP 2008)},
year = {2008},
location = {Funchal, Portugal},
keywords = {IM2.MPR, Report_VII},
annote = {NOTE(review): the "2.34 ?" in the abstract is a garbled character, most likely a degree sign; verify against the published abstract. The citation key says ICCV but the venue is VISAPP -- key kept unchanged to avoid breaking citations.},
abstract = {In this paper we present a solution for eye gaze detection from a wireless head mounted camera designed for children aged between 6 months and 18 months. Due to the constraints of working with very young children, the system does not seek to be as accurate as other state-of-the-art eye trackers, however it requires no calibration process from the wearer. Gaussian Process Regression and Support Vector Machines are used to analyse the raw pixel data from the video input and return an estimate of the childs gaze direction. A confidence map is used to determine the accuracy the system can expect for each coordinate on the image. The best accuracy so far obtained by the system is 2.34 ? on adult subjects, tests with children remain to be done.}
}

@inproceedings{knox08,
title = {Getting the last laugh: automatic laughter segmentation in meetings},
author = {Knox, M.  and Morgan, N.  and Mirghafori, N. },
booktitle = {Proceedings of Interspeech 2008},
location = {Brisbane, Australia},
year = {2008},
note = {To appear},
keywords = {Report_VII, IM2.AP},
annote = {NOTE(review): apparent duplicate of entry Zhao:ICSA:2008a (same title/authors/year); that entry carries the final page numbers -- consider merging.}
}

@book{neuhaus07bridging,
  title     = {Bridging the gap between graph edit distance and kernel machines},
  author    = {Neuhaus, M.  and Bunke, H. },
  series    = {Machine Perception and Artificial Intelligence},
  volume    = {68},
  publisher = {World Scientific},
  year      = {2007},
  isbn      = {978-981-270-817-5},
  keywords  = {Report_VII, IM2.VP}
}

@article{apb-lre-2008,
  title    = {Dimensionality of dialogue act tagsets: an empirical analysis of large corpora},
  author   = {Popescu-Belis, A. },
  journal  = {Language Resources and Evaluation},
  year     = {2008},
  volume   = {42},
  number   = {1},
  pages    = {99--107},
  doi      = {10.1007/s10579-008-9063-y},
  keywords = {Report_VII, IM2.DMA},
  abstract = {This article compares one-dimensional and multi-dimensional dialogue act tagsets used for automatic labeling of utterances. The influence of tagset dimensionality on tagging accuracy is first discussed theoretically, then based on empirical data from human and automatic annotations of large scale resources, using four existing tagsets: DAMSL, SWBD-DAMSL, ICSI-MRDA and MALTUS. The Dominant Function Approximation proposes that automatic dialogue act taggers could focus initially on finding the main dialogue function of each utterance, which is empirically acceptable and has significant practical relevance.}
}

@inproceedings{perrin:hri:2008,
title = {A comparative psychophysical and eeg study of different feedback modalities for hri},
author = {Perrin, X.  and Chavarriaga, R.  and Ray, C.  and Siegwart, R.  and Mill{\'a}n, J. del R. },
booktitle = {Human-Robot Interaction (HRI08)},
year = {2008},
keywords = {Report_VII, IM2.BMI, joint publication},
abstract = {This paper presents a comparison between six different ways to convey navigational information provided by a robot to a human. Visual, auditory, and tactile feedback modalities were selected and designed to suggest a direction of travel to a human user, who can then decide if he agrees or not with the robot's proposition. This work builds upon a previous research on a novel semi-autonomous navigation system in which the human supervises an autonomous system, providing corrective monitoring signals whenever necessary. We recorded both qualitative (user impressions based on selected criteria and ranking of their feelings) and quantitative (response time and accuracy) information regarding different types of feedback. In addition, a preliminary analysis of the influence of the different types of feedback on brain activity is also shown. The result of this study may provide guidelines for the design of such a human-robot interaction system, depending on both the task and the human user.}
}

@inproceedings{apb-ucnlg-07,
  title     = {Evaluation of nlg: some analogies and differences with mt and reference resolution},
  author    = {Popescu-Belis, A. },
  booktitle = {MT Summit XI Workshop on Using Corpora for NLG and MT (UCNLG MT)},
  year      = {2007},
  pages     = {66--68},
  keywords  = {Report_VII, IM2.DMA}
}

@inproceedings{cuendet07,
title = {Automatic Labeling Inconsistencies Detection And Correction For Sentence Unit Segmentation In Conversational Speech},
author = {Cuendet, S.  and Hakkani-Tur, D.  and Shriberg, E. },
booktitle = {Proceedings of MLMI},
location = {Brno, Czech Republic},
year = {2007},
note = {To appear},
keywords = {Report_VI, IM2.AP}
}

@inproceedings{knox07,
title = {Automatic Laughter Detection Using Neural Networks},
author = {Knox, M.  and Mirghafori, N. },
booktitle = {Proceedings of Interspeech},
location = {Antwerp},
year = {2007},
note = {To appear},
keywords = {Report_VI, IM2.AP}
}

@inproceedings{tsamuel:interspeech-1:2008,
  title     = {Front-end for far-field speech recognition based on frequency domain linear prediction},
  author    = {Ganapathy, S.  and Thomas, A.  and Hermansky, H. },
  booktitle = {Interspeech 2008},
  year      = {2008},
  crossref  = {tsamuel:rr08-17},
  location  = {Brisbane, Australia},
  note      = {IDIAP-RR 08-17},
  keywords  = {IM2.AP, Report_VII},
  abstract  = {Automatic Speech Recognition (ASR) systems usually fail when they encounter speech from far-field microphone in reverberant environments. This is due to the application of short-term feature extraction techniques which do not compensate for the artifacts introduced by long room impulse responses. In this paper, we propose a front-end, based on Frequency Domain Linear Prediction (FDLP), that tries to remove reverberation artifacts present in far-field speech. Long temporal segments of far-field speech are analyzed in narrow frequency sub-bands to extract FDLP envelopes and residual signals. Filtering the residual signals with gain normalized inverse FDLP filters result in a set of sub-band signals which are synthesized to reconstruct the signal back. ASR experiments on far-field speech data processed by the proposed front-end show significant improvements (relative reduction of $30 \%$ in word error rate) compared to other robust feature extraction techniques.}
}

@inproceedings{Zhao:ICSA:2008a,
title = {Getting the last laugh: automatic laughter segmentation in meetings},
author = {Knox, M.  and Morgan, N.  and Mirghafori, N. },
booktitle = {9th International Conference of the ISCA (Interspeech 2008), Brisbane, Australia},
year = {2008},
pages = {797--800},
keywords = {IM2.AP, Report_VIII},
annote = {NOTE(review): apparent duplicate of entry knox08 (same title, authors, year, venue); this entry has the final pages -- consider merging. Key prefix "Zhao" does not match the authors.}
}

@article{Dessimoz2006MBioIDFSI,
title = {Multimodal biometrics for identity documents (MBioID)},
author = {Dessimoz, D.  and Richiardi, J.  and Champod, C.  and Drygajlo, A. },
journal = {Forensic Science International},
year = {2007},
volume = {167},
pages = {154--159},
doi = {10.1016/j.forsciint.2006.06.037},
note = {See also: Proc. 4th European Academy of Forensic Sciences Conference},
keywords = {Report_VI, IM2.MPR},
owner = {Jori}
}

@article{marcel:ijivp:2007,
  title    = {On the recent use of local binary patterns for face authentication},
  author   = {Marcel, S.  and Rodriguez, Y.  and Heusch, G. },
  journal  = {International Journal on Image and Video Processing Special Issue on Facial Image Processing},
  year     = {2007},
  crossref = {marcel:rr06-34},
  note     = {IDIAP-RR 06-34},
  keywords = {IM2.VP, Report_VII},
  abstract = {This paper presents a survey on the recent use of Local Binary Patterns (LBPs) for face recognition. LBP is becoming a popular technique for face representation. It is a non-parametric kernel which summarizes the local spacial structure of an image and it is invariant to monotonic gray-scale transformations. This is a very interesting property in face recognition. This probably explains the recent success of Local Binary Patterns in face recognition. In this paper, we describe the LBP technique and different approaches proposed in the literature to represent and to recognize faces. The most representatives are considered for experimental comparison on a common face authentication task. For that purpose, the XM2VTS and BANCA databases are used according to their respective experimental protocols.}
}

@inproceedings{Lalanne2007a,
title = {An ego-centric and tangible approach to meeting indexing and browsing},
author = {Lalanne, D.  and Ev\'equoz, F.  and Rigamonti, M.  and Dumas, B.  and Ingold, R. },
booktitle = {4th Joint Workshop on Multimodal Interaction and Related Machine Learning Algorithms (MLMI'07)},
year = {2007},
note = {To appear},
keywords = {Report_VI, IM2.HMI}
}

@inproceedings{muller07,
title = {Combining Short-term Cepstral and Long-term Pitch Features for Automatic Recognition of Speaker Age},
author = {M{\"u}ller, C.  and Burkhardt, F. },
booktitle = {Proceedings of Interspeech},
location = {Antwerp},
year = {2007},
note = {To appear},
keywords = {Report_VI, IM2.AP}
}

@inproceedings{SMC2007,
  title     = {Valence-arousal evaluation using physiological signals in an emotion recall paradigm},
  author    = {Chanel, G.  and Ansari-Asl, K.  and Pun, T. },
  booktitle = {2007 IEEE SMC, Int. Conf. on Systems, Man and Cybernetics, Smart cooperative systems and cybernetics: advancing knowledge and security for humanity},
  year      = {2007},
  keywords  = {Report_VI, IM2.MPR}
}

@article{KokFro-TMM.07,
  title    = {Semantic coding by supervised dimensionality reduction},
  author   = {Kokiopoulou, E.  and Frossard, P. },
  journal  = {IEEE Transactions on Multimedia},
  year     = {2008},
  volume   = {10},
  number   = {2},
  keywords = {Report_VII, IM2.DMA.VP, joint}
}

@inproceedings{Vijayasenan_ICASSP2009_2009,
title = {Mutual information based channel selection for speaker diarization of meetings data},
author = {Vijayasenan, D.  and Valente, F.  and Bourlard, H. },
booktitle = {Proceedings of International Conference on Acoustics, Speech and Signal Processing},
year = {2009},
keywords = {IM2.AP, Report_VIII},
abstract = {In the meeting case scenario, audio is often recorded using Multiple Distance Microphones (MDM) in a non-intrusive manner. Typically a beamforming is performed in order to obtain a single enhanced signal out of the multiple channels. This paper investigates the use of mutual information for selecting the channel subset that produces the lowest error in a diarization system. Conventional systems perform channel selection on the basis of signal properties such as SNR, cross correlation. In this paper, we propose the use of a mutual information measure that is directly related to the objective function of the diarization system. The proposed algorithms are evaluated on the NIST RT 06 eval dataset. Channel selection improves the speaker error by 1.1\% absolute (6.5\% relative) w.r.t. the use of all channels.},
projects = {Idiap,
AMIDA,
IM2},
}

@article{fleuret-2009,
title = {Multi-layer boosting for pattern recognition},
author = {Fleuret, F. },
journal = {Pattern Recognition Letters (PRL)},
year = {2009},
volume = {30},
pages = {237--241},
keywords = {IM2.VP, Report_VIII}
}

@article{laptev:cviu:2007,
title = {Local velocity-adapted motion events for spatio-temporal recognition},
author = {Laptev, I.  and Caputo, B.  and Lindeberg, T. },
journal = {Computer Vision and Image Understanding},
year = {2007},
volume = {108},
number = {3},
pages = {207--229},
issn = {1077-3142},
keywords = {IM2.VP, Report_VII},
abstract = {In this paper we address the problem in motion recognition using event-based local motion representations. We assume that similar patterns of motion contain similar events with consistent motion across image sequences. Using this assumption, we formulate the problem of motion recognition as a matching of corresponding events in image sequences. To enable the matching, we present and evaluate a set of motion descriptors exploiting the spatial and the temporal coherence of motion measurements between corresponding events in image sequences. As motion measurements may depend on the relative motion of the camera, we also present a mechanism for local velocity adaptation of events and evaluate its influence when recognizing image sequences subjected to different camera motions. When recognizing motion, we compare the performance of nearest neighbor (NN) classifier with the performance of support vector machine (SVM).We also compare event-based motion representations to motion representations by global histograms. An experimental evaluation on a large video database with human actions demonstrates the advantage of the proposed scheme for event-based motion representation in combination with SVM classification. The particular advantage of event-based representations and velocity adaptation is further emphasized when recognizing human actions in unconstrained scenes with complex and non-stationary backgrounds.}
}

@article{Dumas20089,
title = {D\'emonstration : hephaistk, une bo\^{\i}te {\`a} outils pour le prototypage d'interfaces multimodales},
author = {Dumas, B.  and Lalanne, D.  and Ingold, R. },
year = {2008},
keywords = {Report_VII, IM2.HMI}
}

@inproceedings{kludas2008:gfkl,
title = {Exploiting synergistic and redundant features for multimedia document classification},
author = {Kludas, J.  and Bruno, E.  and Marchand-Maillet, S. },
booktitle = {32nd Annual Conference of the German Classification Society - Advances in Data Analysis, Data Handling and Business Intelligence (GfKl 2008)},
year = {2008},
url = {http://viper.unige.ch/documents/pdf/kludas2008-gfkl.pdf},
keywords = {IM2.MCA, Report_VIII}
}

@inproceedings{graves07unconstrained,
title = {Unconstrained on-line handwriting recognition with recurrent neural networks},
author = {Graves, A.  and Liwicki, M.  and Bunke, H. },
booktitle = {Advances in Neural Information Processing},
series = {NIPS},
year = {2007},
volume = {20},
keywords = {Report_VII, IM2.VP},
peer = {yes}
}

@inproceedings{ali-et-al-2009,
title = {Joint learning of pose estimators and features for object detection},
author = {Ali, K.  and Fleuret, F.  and Hasler, D.  and Fua, P. },
booktitle = {Proceedings of the IEEE International Conference on Computer Vision (ICCV)},
year = {2009},
note = {(to appear)},
keywords = {IM2.VP, Report_VIII}
}

@inproceedings{MMSPL-CONF-2008-002,
title = {Face Detection Using Discrete Gabor Jets And Color Information},
author = {Hoffmann, U.  and Naruniec, J.  and Yazdani, A.  and Ebrahimi, T. },
booktitle = {SIGMAP 2008 - International Conference on Signal Processing and Multimedia Applications},
year = {2008},
location = {Porto},
url = {http://www.sigmap.org/},
keywords = {Report_VII, IM2.VP,Face Detection; Colored Image Patch Model; Discrete Gabor Jets; Linear Discriminant Analysis},
abstract = {Face detection allows to recognize and detect human faces and provides information about their location in a given image. Many applications such as biometrics, face recognition, and video surveillance employ face detection as one of their main modules. Therefore, improvement in the performance of existing face detection systems and new achievements in this field of research are of significant importance. In this paper a hierarchical classification approach for face detection is presented. In the first step, discrete Gabor jets (DGJ) are used for extracting features related to the brightness information of images and a preliminary classification is made. Afterwards, a skin detection algorithm, based on modeling of colored image patches, is employed as a post-processing of the results of DGJ-based classification. It is shown that the use of color efficiently reduces the number of false positives while maintaining a high true positive rate. Finally, a comparison is made with the OpenCV implementation of the Viola and Jones face detector and it is concluded that higher correct classification rates can be attained using the proposed face detector.},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/record/125104},
oai-id = {oai:infoscience.epfl.ch:125104},
oai-set = {conf},
review = {REVIEWED},
status = {ACCEPTED},
unit = {MMSPL}
}

@inproceedings{norman:icassp:2006,
title = {Using chimeric users to construct fusion classifiers in biometric authentication tasks: an investigation},
author = {Poh, N.  and Bengio, S. },
booktitle = {IEEE Int. Conf. on Acoustics, Speech, and Signal Processing (ICASSP)},
year = {2006},
note = {IDIAP-RR 05-59},
keywords = {Report_VI, IM2.MPR},
abstract = {Chimeric users have recently been proposed in the field of biometric person authentication as a way to overcome the problem of lack of real multimodal biometric databases as well as an important privacy issue -- the fact that too many biometric modalities of a same person stored in a single location can present a emphhigher risk of identity theft. While the privacy problem is indeed solved using chimeric users, it is still an open question of how such chimeric database can be efficiently used. For instance, the following two questions arise: i) Is the performance measured on a chimeric database a good predictor of that measured on a real-user database?, and, ii) can a chimeric database be exploited to emphimprove the generalization performance of a fusion operator on a real-user database?. Based on a considerable amount of empirical biometric person authentication experiments (21 real-user data sets and up to $21 times 1000$ chimeric data sets and two fusion operators), our previous study citePoh_05_chimeric answers bf no to the first question. The current study aims to answer the second question. Having tested on four classifiers and as many as 3380 face and speech bimodal fusion tasks (over 4 different protocols) on the BANCA database and four different fusion operators, this study shows that generating multiple chimeric databases emphdoes not degrade nor improve the performance of a fusion operator when tested on a real-user database with respect to using only a real-user database. Considering the possibly expensive cost involved in collecting the real-user multimodal data, our proposed approach is thus emphuseful to construct a trainable fusion classifier while at the same time being able to overcome the problem of small size training data.},
ipdmembership = {learning},
ipdxref = {techreport:norman-idiap-rr-05-59.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2006/norman-icassp-2006.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2006/norman-icassp-2006.ps.gz}
}

@inproceedings{Lefevre:ICME:2009,
title = {Structure and appearance features for robust 3d facial actions tracking},
author = {Lef{\`e}vre, S.  and Odobez, J. -M. },
booktitle = {International Conference on Multimedia and Expo (ICME)},
year = {2009},
keywords = {IM2.VP, Report_VIII}
}

@inproceedings{MMSPL-CONF-2008-007,
title = {A comparative study of color image compression standards using perceptually driven quality metrics},
author = {De Simone, F.  and Ticca, D.  and Dufaux, F.  and Ansorge, M.  and Ebrahimi, T. },
booktitle = {SPIE Optics and Photonics},
year = {2008},
location = {San Diego, CA USA},
keywords = {Report_VII, IM2.MCA, Image compression; codec performance; Full-Reference quality assessment; perceptual quality metrics},
abstract = {The task of comparing the performance of different codecs is strictly related to the research in the field of objective quality metrics. Even if several objective quality metrics have been proposed in literature, the lack of standardization in the field of objective quality assessment and the lack of extensive and reliable comparisons of the performance of the different state-of-the-art metrics often make the results obtained using objective metrics not very reliable. In this paper we aim at comparing the performance of three of the existing alternatives for compression of digital pictures, i.e. JPEG, JPEG 2000, and JPEG XR compression, by using different objective Full Reference metrics and considering also perceptual quality metrics which take into account the color information of the data under analysis.}
}

@inproceedings{MMSPL-CONF-2008-005,
title = {Towards Fully Automatic Image Segmentation Evaluation},
author = {Goldmann, L.  and Adamek, T.  and Vajda, P.  and Karaman, M.  and Mörzinger, R.  and Galmar, E.  and Sikora, T.  and O'Connor, N.  and Ha-Minh, T.  and Ebrahimi, T.  and Schallauer, P.  and Huet, B. },
booktitle = {Advanced Concepts for Intelligent Vision Systems (ACIVS)},
series = {Lecture Notes in Computer Science},
year = {2008},
publisher = {Springer},
location = {Juan-les-Pins},
url = {http://acivs.org/acivs2008/},
keywords = {Report_VII, IM2.MCA},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/record/125734},
oai-id = {oai:infoscience.epfl.ch:125734},
oai-set = {conf},
review = {REVIEWED},
status = {ACCEPTED},
unit = {MMSPL}
}

@inproceedings{MMSPL-CONF-2008-004,
title = {A search and retrieval framework for the management of copyrighted audiovisual content},
author = {Carreras, A.  and Cordara, G.  and Delgado, J.  and Dufaux, F.  and Francini, G.  and Ha, T. M.  and Rodriguez, E.  and Tous, R. },
booktitle = {50th International Symposium ELMAR 2008},
year = {2008},
url = {http://infoscience.epfl.ch/getfile.py?recid=125158&mode=best},
keywords = {Report_VII, IM2.MCA},
abstract = {This paper presents a search and retrieval framework that enables the management of Intellectual Property in the World Wide Web. This twofold framework helps users to detect digital rights infringements of their copyrighted content. In order to detect possible copyright infringments, first the system crawls the Web to search replicas of users images, and later evaluates if the copies have been taken respecting the terms stated by the owner. On the other hand, this framework also helps users in finding something interesting in the Web. It will provide copyrighted content to users according to their preferences and to intellectual property rights integrating search and retrieval with digital rights management tools.},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/record/125158},
oai-id = {oai:infoscience.epfl.ch:125158},
oai-set = {conf; fulltext; fulltext},
review = {REVIEWED},
status = {PUBLISHED},
unit = {MMSPL}
}

@inproceedings{Yao_EUSIPCO_2008,
title = {Multi-camera 3d person tracking with particle filter in a surveillance environment},
author = {Yao, J.  and Odobez, J. -M. },
booktitle = {16th European Signal processing Conference (EUSIPCO)},
year = {2008},
keywords = {IM2.VP, Report_VIII},
abstract = {In this work we present and evaluate a novel 3D approach to track single people in surveillance scenarios, using multiple cameras. The problem is formulated in a Bayesian filtering framework, and solved through sampling approximations (i.e. using a particle filter). Rather than relying on a 2D state to represent people, as is most commonly done, we directly exploit 3D knowledge by tracking people in the 3D world. A novel dynamical model is presented that accurately models the coupling between people orientation and motion direction. In addition, people are represented by three 3D elliptic cylinders which allow to introduce a spatial color layout useful to discriminate the tracked person from potential distractors. Thanks to the particle filter approach, integrating background subtraction and color observations from multiple cameras is straightforward. Alltogether, the approach is quite robust to occlusion and large variations in people appearence, even when using a single camera, as demonstrated by numerical performance evaluation on real and challenging data from an underground station.},
projects = {Idiap,
CARETAKER},
}

@inbook{DominiqueBrodbeck20093,
title = {Interactive visualization - a survey},
author = {Brodbeck, D.  and Mazza, R.  and Lalanne, D. },
year = {\bibnodate},
keywords = {IM2.HMI, Report_VIII}
}

@techreport{haketa:rr08-39,
title = {Enhanced phone posteriors for improving speech recognition systems},
author = {Ketabdar, H.  and Bourlard, H. },
year = {2008},
type = {Idiap-RR},
number = {Idiap-RR-39-2008},
institution = {IDIAP},
keywords = {IM2.AP, Report_VII},
abstract = {Using phone posterior probabilities has been increasingly explored for improving automatic speech recognition (ASR) systems. In this paper, we propose two approaches for hierarchically enhancing these phone posteriors, by integrating long acoustic context, as well as prior phonetic and lexical knowledge. In the first approach, phone posteriors estimated with a Multi-Layer Perceptron (MLP), are used as emission probabilities in HMM forward-backward recursions. This yields new enhanced posterior estimates integrating HMM topological constraints (encoding specific phonetic and lexical knowledge), and context. posteriors are post-processed by a secondary MLP, in order to learn inter and intra dependencies between the phone posteriors. These dependencies are prior phonetic knowledge. The learned knowledge is integrated in the posterior estimation during the inference (forward pass) of the second MLP, resulting in enhanced phone posteriors. We investigate the use of the enhanced posteriors in hybrid HMM/ANN and Tandem configurations. We propose using the enhanced posteriors as replacement, or as complementary evidences to the regular MLP posteriors. The proposed method has been tested on different small and large vocabulary databases, always resulting in consistent improvements in frame, phone and word recognition rates.}
}

@phdthesis{ba-thesis-2007,
title = {Joint head tracking and pose estimation for visual focus of attention recognition},
author = {Ba, S. },
year = {2007},
school = {\'Ecole Polytechnique F\'ed\'erale de Lausanne},
note = {Th{\`e}se sciences Ecole polytechnique f\'ed\'erale de Lausanne EPFL, no 3764 (2007), Facult\'e des sciences et techniques de l'ing\'enieur STI, Section de g\'enie \'electrique et \'electronique, Institut de g\'enie \'electrique et \'electronique IEL (Laboratoire de l'IDIAP LIDIAP). Dir.: Herv\'e Bourlard, Jean-Marc Odobez},
keywords = {IM2.VP, Report_VI},
}

@article{Fleuret_JMLR_2008,
title = {Stationary features and cat detection},
author = {Fleuret, F.  and Geman, D. },
crossref = {fleuret-geman-rr2007},
journal = {Journal of Machine Learning Research},
year = {2008},
keywords = {IM2.VP, Report_VII},
abstract = {Most discriminative techniques for detecting instances from object categories in still images consist of looping over a partition of a pose space with dedicated binary classifiers. The efficiency of this strategy for a complex pose, i.e., for fine-grained descriptions, can be assessed by measuring the effect of sample size and pose resolution on accuracy and computation. Two conclusions emerge: i) fragmenting the training data, which is inevitable in dealing with high in-class variation, severely reduces accuracy; ii) the computational cost at high resolution is prohibitive due to visiting a massive pose partition. To overcome data-fragmentation we propose a novel framework centered on pose-indexed features which assign a response to a pair consisting of an image and a pose, and are designed to be stationary: the probability distribution of the response is always the same if an object is actually present. Such features allow for efficient, one-shot learning of pose-specific classifiers. To avoid expensive scene processing, we arrange these classifiers in a hierarchy based on nested partitions of the pose; as in previous work on coarse-to-fine search, this allows for efficient processing. The hierarchy is then "folded" for training: all the classifiers at each level are derived from one base predictor learned from all the data. The hierarchy is "unfolded" for testing: parsing a scene amounts to examining increasingly finer object descriptions only when there is sufficient evidence for coarser ones. In this way, the detection results are equivalent to an exhaustive search at high resolution. We illustrate these ideas by detecting and localizing cats in highly cluttered greyscale scenes.}
}

@inproceedings{Galbally_BIDS_2009,
title = {Hill-Climbing Attack to an Eigenface-Based Face Verification System},
author = {Galbally, J.  and McCool, C.  and Fierrez, J.  and Marcel, S.  and Ortega-Garcia, J. },
booktitle = {Proceedings of the First IEEE International Conference on Biometrics, Identity and Security (BIdS)},
year = {2009},
keywords = {IM2.VP, Report_VIII},
abstract = {We use a general hill-climbing attack algorithm based on Bayesian adaption to test the vulnerability of an Eigenface-based approach for face recognition against indirect attacks. The attacking technique uses the scores provided by the matcher to adapt a global distribution, computed from a development set of users, to the local specificities of the client being attacked. The proposed attack is evaluated on an Eigenfacebased verification system using the XM2VTS database. The results show a very high efficiency of the hill-climbing algorithm, which successfully bypassed the system for over 85\% of the attacked accounts.},
projects = {Idiap,
MOBIO},
}

@article{mariethoz:pattern:2006,
title = {A kernel trick for sequences applied to text-independent speaker verification systems},
author = {Mari\'ethoz, J.  and Bengio, S. },
crossref = {mariethoz:rr05-77},
journal = {Pattern Recognition},
year = {2007},
volume = {40},
number = {8},
issn = {0031-3203},
note = {IDIAP-RR 05-77},
keywords = {IM2.AP, Report_VI},
abstract = {This paper present a principled SVM based speaker verification system. We propose a new framework and a new sequence kernel that can make use of any Mercer kernel at the frame level. An extension of the sequence kernel based on the Max operator is also proposed. The new system is compared to state-of-the-art GMM and other SVM based systems found in the literature on the Banca and Polyvar databases. The new system outperforms, most of the time, the other systems, statistically significantly. Finally, the new proposed framework clarifies previous SVM based systems and suggests interesting future research directions.}
}

@book{camvin08,
title = {Machine learning for audio, image and video analysis},
author = {Camastra, F.  and Vinciarelli, A. },
booktitle = {Machine learning for audio, image and video analysis},
series = {Advanced Information and Knowledge Processing},
year = {2008},
volume = {XVI},
number = {496 p.},
edition = {Theory and Applications},
publisher = {Springer Verlag},
isbn = {978-1-84800-006-3},
keywords = {IM2.MCA, Report_VII},
abstract = {Machine Learning involves several scientific domains including mathematics, computer science, statistics and biology, and is an approach that enables computers to automatically learn from data. Focusing on complex media and how to convert raw data into useful information, this book offers both introductory and advanced material in the combined fields of machine learning and image/video processing. The machine learning techniques presented enable readers to address many real world problems involving complex data. Examples covering areas such as automatic speech and handwriting transcription, automatic face recognition, and semantic video segmentation are included, along with detailed introductions to algorithms and examples of their applications. The book is organized in four parts: The first focuses on technical aspects, basic mathematical notions and elementary machine learning techniques. The second provides an extensive survey of most relevant machine learning techniques for media processing, while the third part focuses on applications and shows how techniques are applied in actual problems. The fourth part contains detailed appendices that provide notions about the main mathematical instruments used throughout the text. Students and researchers needing a solid foundation or reference, and practitioners interested in discovering more about the state-of-the-art will find this book invaluable. Examples and problems are based on data and software packages publicly available on the web.}
}

@inproceedings{elhannani06:odis,
title = {Using data-driven and phonetic units for speaker verification},
author = {El Hannani, A.  and Toledano, D.  and Petrovska, D.  and Montero-Asenjo, A.  and Hennebert, J. },
booktitle = {IEEE Speaker and Language Recognition Workshop (Odyssey 2006), Puerto Rico},
year = {2006},
keywords = {Report_VI, IM2.MPR}
}

@inproceedings{magimai07,
title = {Entropy based classifier combination for sentence segmentation},
author = {Magimai-Doss, M.  and Hakkani-Tur, D.  and Cetin, O.  and Shriberg, E.  and Fung, J.  and Mirghafori, N. },
booktitle = {Proc. ICASSP, Honolulu},
year = {2007},
keywords = {Report_VI, IM2.AP}
}

@article{jaimes2007,
title = {Guest Editors' Introduction: Human-Centered Computing-Toward a Human Revolution},
author = {Jaimes, A.  and Gatica-Perez, D.  and Sebe, N.  and Huang, T. S. },
journal = {Computer},
year = {2007},
volume = {40},
number = {5},
pages = {30--34},
keywords = {Report_VI, IM2.HMI, hci}
}

@inproceedings{kamangar08,
title = {An iterative unsupervised learning method for information distillation},
author = {Kamangar, K.  and Hakkani-Tur, D.  and Tur, G.  and Levit, M. },
booktitle = {IEEE ICASSP, Las Vegas, NV},
year = {2008},
note = {accepted},
keywords = {Report_VII, IM2.AP}
}

@inproceedings{farrahi:mlmi:2008,
title = {Daily routine classification from mobile phone data},
author = {Gatica-Perez, D.  and Farrahi, K. },
crossref = {farrahi:rr07-62},
booktitle = {Workshop on Machine Learning and Multimodal Interaction (MLMI08)},
year = {2008},
location = {Utrecht, The Netherlands},
note = {IDIAP-RR 07-62},
keywords = {IM2.MCA, Report_VII},
abstract = {The automatic analysis of real-life, long-term behavior and dynamics of individuals and groups from mobile sensor data constitutes an emerging and challenging domain. We present a framework to classify people's daily routines (defined by day type, and by group affiliation type) from real-life data collected with mobile phones, which include physical location information (derived from cell tower connectivity), and social context (given by person proximity information derived from Bluetooth). We propose and compare single- and multi-modal routine representations at multiple time scales, each capable of highlighting different features from the data, to determine which best characterized the underlying structure of the daily routines. Using a massive data set of 87000 hours spanning four months of the life of 30 university students, we show that the integration of location and social context and the use of multiple time-scales used in our method is effective, producing accuracies of over 80\% for the two daily routine classification tasks investigated, with significant performance differences with respect to the single-modal cues.}
}

@inproceedings{voloshynovskiy3,
title = {Random projections based item authentication},
author = {Voloshynovskiy, S.  and Koval, O.  and Beekhof, F.  and Pun, T. },
booktitle = {Electronic Imaging 2009},
year = {2009},
keywords = {IM2.MPR, Report_VIII}
}

@incollection{GaticaBook09,
title = {Modeling interest in face-to-face conversations from multimodal nonverbal behavior},
author = {Gatica-Perez, D. },
booktitle = {Multimodal Signal Processing},
editor = {Thiran, J. -P.  and Bourlard, H.  and Marques, F. },
publisher = {Academic Press},
note = {in press},
year = {\bibnodate},
keywords = {IM2.MPR, Report_VIII}
}

@inproceedings{RichDrygTodes1,
title = {Promoting diversity in gaussian mixture ensembles: an application to signature verification},
author = {Richiardi, J.  and Drygajlo, A.  and Todesco, L. },
booktitle = {Biometrics and Identity Management, Lecture Notes in Computer Science 5372},
year = {2008},
pages = {140--149},
keywords = {IM2.MPR, Report_VIII}
}

@incollection{millan:2006:yearbook,
title = {Non-invasive brain-actuated control of a mobile robot by human eeg},
author = {del R. Millán, J.  and Renkens, F.  and Mouri{\~n}o, J.  and Gerstner, W. },
booktitle = {2006 IMIA Yearbook of Medical Informatics},
year = {2006},
publisher = {Schattauer Verlag},
keywords = {Report_VI, IM2.BMI, major},
abstract = {Brain activity recorded non-invasively is sufficient to control a mobile robot if advanced robotics is used in combination with asynchronous EEG analysis and machine learning techniques. Until now brain-actuated control has mainly relied on implanted electrodes, since EEG-based systems have been considered too slow for controlling rapid and complex sequences of movements. We show that two human subjects successfully moved a robot between several rooms by mental control only, using an EEG-based brain-machine interface that recognized three mental states. Mental control was comparable to manual control on the same task with a performance ratio of 0.74.},
ipdmembership = {learning}
}

@inproceedings{MMSPL-CONF-2009-008,
title = {Towards Generic Detection of Unusual Events in Video Surveillance},
author = {Ivanov, I.  and Dufaux, F.  and Ha, T. M.  and Ebrahimi, T. },
booktitle = {6th IEEE International Conference on Advanced Video and Signal Based Surveillance (AVSS'09)},
year = {2009},
location = {Genoa, Italy},
url = {http://www.avss09.org/},
keywords = {Unusual event; Trajectory representation; Feature extraction; Support Vector Machine classifier, IM2.MCA, Report_VIII},
abstract = {In this paper, we consider the challenging problem of unusual event detection in video surveillance systems. The proposed approach makes a step toward generic and automatic detection of unusual events in terms of velocity and acceleration. At first, the moving objects in the scene are detected and tracked. A better representation of moving objects trajectories is then achieved by means of appropriate pre-processing techniques. A supervised Support Vector Machine method is then used to train the system with one or more typical sequences, and the resulting model is then used for testing the proposed method with other typical sequences (different scenes and scenarios). Experimental results are shown to be promising. The presented approach is capable of determining similar unusual events as in the training sequences.},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/record/138573},
oai-id = {oai:infoscience.epfl.ch:138573},
oai-set = {conf},
review = {REVIEWED},
status = {ACCEPTED},
unit = {MMSPL}
}

@inproceedings{Kaufmann:07,
title = {An HPSG parser supporting discontinuous licenser rules},
author = {Kaufmann, T.  and Pfister, B. },
booktitle = {International Conference on HPSG},
year = {2007},
note = {(to appear)},
keywords = {Report_VI, IM2.AP}
}

@article{Evequoz200710,
title = {Personal information management through interactive visualizations},
author = {Ev\'equoz, F.  and Lalanne, D. },
year = {2007},
pages = {158--160},
keywords = {Report_VII, IM2.HMI}
}

@article{millan:2008:ieee-is,
title = {Brain-Controlled Robots},
author = {Millán, J. del R. },
journal = {IEEE Intelligent Systems},
year = {2008},
keywords = {IM2.BMI, Report_VIII},
abstract = {The idea of moving robots or prosthetic devices not by manual control, but by mere ``thinking'' (i.e., the brain activity of human subjects) has fascinated researchers for the last 30 years, but it is only now that first experiments have shown the possibility to do so. How can brainwaves be used to directly control robots? Most of the hope for brain-controlled robots comes from invasive approaches that provide detailed single neuron activity recorded from microelectrodes implanted in the brain [1]. The motivation for these invasive approaches is that it has been widely shown that motor parameters related to hand and arm movements are encoded in a distributed and redundant way by ensembles of neurons in the motor system of the brain---motor, premotor and posterior parietal cortex. For humans, however, it is preferable to use non-invasive approaches to avoid health risks and the associated ethical concerns. Most non-invasive brain-computer interfaces (BCI) use electroencephalogram (EEG) signals; i.e., the electrical brain activity recorded from electrodes placed on the scalp. The main source of the EEG is the synchronous activity of thousands of cortical neurons. Thus, EEG signals suffer from a reduced spatial resolution and increased noise due to measurements on the scalp. As a consequence, current EEG-based brain-actuated devices are limited by a low channel capacity and are considered too slow for controlling rapid and complex sequences of robot movements. But, recently, we have shown for the first time that online analysis of EEG signals, if used in combination with advanced robotics and machine learning techniques, is sufficient for humans to continuously control a mobile robot [2] and a wheelchair [3]. In this article we will review our work on non-invasive brain-controlled robots and discuss some of the challenges ahead.},
ipdmembership = {learning},
projects = {Idiap},
pdf = {ftp://ftp.idiap.ch/pub/reports/2008/millan_2008_is.pdf}
}

@techreport{pgarner:rr08-03,
title = {A weighted finite state transducer tutorial},
author = {Garner, P. N. },
year = {2008},
type = {Idiap-Com},
number = {Idiap-Com-03-2008},
institution = {IDIAP},
keywords = {IM2.AP, Report_VII},
abstract = {The concepts of WFSTs are summarised, including structural and stochastic optimisations. A typical composition process for ASR is described. Some experiments show that care should be taken with silence models.}
}

@inproceedings{eth_biwi_00483,
title = {Dynamic 3d scene analysis from a moving vehicle},
author = {Leibe, B.  and Cornelis, N.  and Cornelis, K.  and van Gool, L. },
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR'07)},
year = {2007},
keywords = {Report_VI, IM2.VP, major publication, Best Paper Awards}
}

@inproceedings{sba:mlmi:2006,
title = {A study on visual focus of attention recognition from head pose in a meeting room},
author = {Ba, S.  and Odobez, J. -M. },
booktitle = {3rd Joint Workshop on Multimodal Interaction and Related Machine Learning Algorithms (MLMI06)},
year = {2006},
note = {IDIAP-RR 06-10},
keywords = {Report_VI, IM2.VP.MPR, Joint publication},
abstract = {This paper presents a study on the recognition of the visual focus of attention (VFOA) of meeting participants based on their head pose. Contrarily to previous studies on the topic, in our set-up, the potential VFOA of people is not restricted to other meeting participants only, but includes environmental targets (table, slide screen). This has two consequences. Firstly, this increases the number of possible ambiguities in identifying the VFOA from the head pose. Secondly, due to our particular set-up, the identification of the VFOA from head pose can not rely on an incomplete representation of the pose (the pan), but requests the knowledge of the full head pointing information (pan and tilt). In this paper, using a corpus of 8 meetings of 8 minutes on average, featuring 4 persons involved in the discussion of statements projected on a slide screen, we analyze the above issues by evaluating, through numerical performance measures, the recognition of the VFOA from head pose information obtained either using a magnetic sensor device (the ground truth) or a vision based tracking system (head pose estimates). The results clearly show that in complex but realistic situations, it is quite optimistic to believe that the recognition of the VFOA can solely be based on the head pose, as some previous studies had suggested.},
ipdmembership = {vision},
ipdxref = {techreport:sba-idiap-rr-06-10.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2006/sba-mlmi-2006.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2006/sba-mlmi-2006.ps.gz}
}

@techreport{smith:rr06-40,
title = {Tracking attention for multiple people: wandering visual focus of attention estimation},
author = {Smith, K.  and Ba, S.  and Odobez, J. -M.  and Gatica-Perez, D. },
year = {2006},
type = {IDIAP-RR},
number = {40},
institution = {IDIAP},
note = {Submitted for publication},
keywords = {Report_VI, IM2.VP},
abstract = {The problem of finding the visual focus of attention of multiple people free to move in an unconstrained manner is defined here as the em wandering visual focus of attention (WVFOA) problem. Estimating the WVFOA for multiple unconstrained people is a new and important problem with implications for human behavior understanding and cognitive science, as well as real-world applications. One such application, which we present in this article, monitors the attention passers-by pay to an outdoor advertisement. In our approach to the WVFOA problem, we propose a multi-person tracking solution based on a hybrid Dynamic Bayesian Network that simultaneously infers the number of people in a scene, their body locations, their head locations, and their head pose. It is defined in a joint state-space formulation that allows for the modeling of interactions between people. For inference in the resulting high-dimensional state-space, we propose a trans-dimensional Markov Chain Monte Carlo (MCMC) sampling scheme, which not only handles a varying number of people, but also efficiently searches the state-space by allowing person-part state updates. Our model was rigorously evaluated for tracking quality and ability to recognize people looking at an outdoor advertisement, and the results indicate good performance for these tasks.},
ipdmembership = {vision},
pdf = {ftp://ftp.idiap.ch/pub/reports/2006/smith-idiap-rr-06-40.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2006/smith-idiap-rr-06-40.ps.gz}
}

@article{Drygajlo1,
title = {Improving biometric verification with class-independent quality information},
author = {Kryszczuk, K.  and Drygajlo, A. },
journal = {IET Signal Processing, Special Issue on Biometric Recognition},
year = {2009},
volume = {3},
pages = {310--321},
keywords = {IM2.MPR, Report_VIII}
}

@inproceedings{eth_biwi_00489,
title = {Fast {3D} scanning with automatic motion compensation},
author = {Weise, T.  and Leibe, B.  and van Gool, L. },
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR'07)},
year = {2007},
keywords = {Report_VI, IM2.VP}
}

@inproceedings{eth_biwi_00521,
title = {Object recognition for the internet of things},
author = {Quack, T.  and Bay, H.  and van Gool, L. },
booktitle = {Internet of Things 2008},
year = {2008},
note = {in press},
keywords = {Report_VII, IM2.MCA}
}

@article{eth_biwi_00520,
title = {Privacy in video surveilled spaces},
author = {Spindler, T.  and Wartmann, C.  and Hovestadt, L.  and Roth, D.  and van Gool, L.  and Steffen, A. },
journal = {Journal of Computer Security},
year = {2008},
volume = {16},
number = {2},
pages = {199--222},
keywords = {Report_VII, IM2.VP, Surveillance, cryptography, computer vision, building automation}
}

@inproceedings{eth_biwi_00527,
title = {Fast body posture estimation using volumetric features},
author = {van den Berg, M.  and Koller-Meier, E.  and van Gool, L. },
booktitle = {IEEE Visual Motion Computing (MOTION)},
year = {2008},
keywords = {Report_VII, IM2.VP, Haarlets, LDA, pose estimation, 3D, hull reconstruction}
}

@inproceedings{LTS-CONF-2007-028,
title = {Low-Dimensional Motion Features for Audio-Visual Speech Recognition},
author = {Gurban, M.  and Valles, A.  and Thiran, J. -Ph. },
booktitle = {15th European Signal Processing Conference (EUSIPCO), Poznan, Poland},
year = {2007},
location = {Poznan, Poland},
url = {http://www.eusipco2007.org/},
keywords = {Report_VI, LTS5, IM2.MPR},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/search.py?recid=109488},
oai-id = {oai:infoscience.epfl.ch:109488},
oai-set = {conf},
review = {REVIEWED},
status = {PUBLISHED},
unit = {LTS}
}

@inproceedings{perrin:fsr:2009,
title = {Learning to identify users and predict their destination in a robotic guidance application},
author = {Perrin, X.  and Colas, F.  and Pradalier, C.  and Siegwart, R. },
booktitle = {Field and Service Robotics (FSR)},
year = {2009},
keywords = {IM2.BMI, Report_VIII}
}

@article{Behera2007,
title = {{DocMIR}: an automatic document-based indexing system for meeting retrieval},
author = {Behera, A.  and Lalanne, D.  and Ingold, R. },
journal = {Multimedia Tools and Applications},
year = {2007},
volume = {37},
number = {2},
keywords = {Report_VI, IM2.DMA}
}

@inproceedings{smith:MLMI:2006,
title = {Multi-person tracking in meetings: a comparative study},
author = {Smith, K.  and Schreiber, S.  and Beran, V.  and Pot\'ucek, I.  and Rigoll, G.  and Gatica-Perez, D. },
booktitle = {Multimodal Interaction and Related Machine Learning Algorithms (MLMI)},
year = {2006},
note = {IDIAP-RR 06-38},
keywords = {Report_VI, IM2.MPR},
abstract = {In this paper, we present the findings of the Augmented Multiparty Interaction (AMI) project investigation on the localization and tracking of 2D head positions in meetings. The focus of the study was to test and evaluate various multi-person tracking methods developed in the project using a standardized data set and evaluation methodology.},
ipdmembership = {vision},
ipdxref = {techreport:smith-idiap-rr-06-38.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2006/smith-MLMI-2006.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2006/smith-MLMI-2006.ps.gz}
}

@inproceedings{Kumatani2008c,
title = {Filter bank design based on minimization of individual aliasing terms for minimum mutual information subband adaptive beamforming},
author = {Kumatani, K.  and McDonough, J.  and Schacht, S.  and Klakow, D.  and Garner, P. N.  and Li, W. },
booktitle = {International Conference on Acoustics, Speech and Signal Processing},
year = {2008},
keywords = {Report_VII, IM2.AP},
owner = {dines}
}

@inproceedings{Kumatani2008b,
title = {Adaptive beamforming with a maximum negentropy criterion},
author = {Kumatani, K.  and McDonough, J.  and Klakow, D.  and Garner, P. N.  and Li, W. },
booktitle = {The Joint Workshop on Hands-free Speech Communication and Microphone Arrays},
year = {2008},
keywords = {Report_VII, IM2.AP},
owner = {dines}
}

@article{Keshet_SPEECHCOMM_2009,
title = {Discriminative Keyword Spotting},
author = {Keshet, J.  and Grangier, D.  and Bengio, S. },
journal = {Speech Communication},
year = {2009},
volume = {51},
number = {4},
pages = {317--329},
keywords = {IM2.AP, Report_VIII},
projects = {Idiap},
}

@inproceedings{cetin07,
title = {An Articulatory Feature-based Tandem Approach and Factored Observation Modeling},
author = {Cetin, O.  and Kantor, A.  and King, S.  and Bartels, C.  and Magimai-Doss, M.  and Frankel, J.  and Livescu, K. },
booktitle = {Proc. ICASSP, Honolulu},
year = {2007},
keywords = {Report_VI, IM2.AP}
}

@article{Humm20084,
title = {Spoken signature for user authentication},
author = {Humm, A.  and Hennebert, J.  and Ingold, R. },
journal = {SPIE Journal of Electronic Imaging},
year = {2008},
volume = {17},
keywords = {Report_VII, IM2.HMI}
}

@article{Humm20085,
title = {Combined handwriting and speech modalities for user authentication},
author = {Humm, A.  and Hennebert, J.  and Ingold, R. },
journal = {IEEE Transactions on Systems, Man, and Cybernetics, Part A: Systems and Humans},
year = {2008},
volume = {38},
keywords = {Report_VII, IM2.HMI}
}

@inproceedings{heusch:ICB:2007,
title = {Face authentication with salient local features and static bayesian network},
author = {Heusch, G.  and Marcel, S. },
booktitle = {IEEE / IAPR Intl. Conf. On Biometrics (ICB)},
year = {2007},
note = {IDIAP-RR 07-04},
keywords = {Report_VI, IM2.VP},
abstract = {In this paper, the problem of face authentication using salient facial features together with statistical generative models is adressed. Actually, classical generative models, and Gaussian Mixture Models in particular make strong assumptions on the way observations derived from face images are generated. Indeed, systems proposed so far consider that local observations are independent, which is obviously not the case in a face. Hence, we propose a new generative model based on Bayesian Networks using only salient facial features. We compare it to Gaussian Mixture Models using the same set of observations. Conducted experiments on the BANCA database show that our model is suitable for the face authentication task, since it outperforms not only Gaussian Mixture Models, but also classical appearance-based methods, such as Eigenfaces and Fisherfaces.},
ipdmembership = {vision},
ipdxref = {techreport:heusch-idiap-rr-07-04.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2007/heusch-ICB-2007.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2007/heusch-ICB-2007.ps.gz}
}

@techreport{garg08,
title = {Speaker role detection in meetings using lexical information and social network analysis},
author = {Garg, N.  and Hakkani-Tur, D. },
number = {TR-08-004},
institution = {International Computer Science Institute, Berkeley, CA},
year = {2008},
keywords = {Report_VII, IM2.AP}
}

@inproceedings{millan:2007:icorr,
title = {Adaptive shared control of a brain-actuated simulated wheelchair},
author = {Philips, J.  and Millán, J. del R.  and Vanacker, G.  and Lew, E.  and Galán, F.  and Ferrez, P. W.  and van Brussel, H.  and Nuttin, M. },
booktitle = {Proceedings of the 10th IEEE International Conference on Rehabilitation Robotics},
year = {2007},
pages = {408--414},
isbn = {978-1-4244-1320-1},
doi = {10.1109/icorr.2007.4428457},
keywords = {IM2.BCI, Report_VI},
abstract = {The use of shared control techniques has a profound impact on the performance of a robotic assistant controlled by human brain signals. However, this shared control usually provides assistance to the user in a constant and identical manner each time. Creating an adaptive level of assistance, thereby complementing the user's capabilities at any moment, would be more appropriate. The better the user can do by himself, the less assistance he receives from the shared control system; and vice versa. In order to do this, we need to be able to detect when and in what way the user needs assistance. An appropriate assisting behaviour would then be activated for the time the user requires help, thereby adapting the level of assistance to the specific situation. This paper presents such a system, helping a brain-computer interface (BCI) subject perform goal-directed navigation of a simulated wheelchair in an adaptive manner. Whenever the subject has more difficulties in driving the wheelchair, more assistance will be given. Experimental results of two subjects show that this adaptive shared control increases the task performance. Also, it shows that a subject with a lower BCI performance has more need for extra assistance in difficult situations, such as manoeuvring in a narrow corridor.}
}

@inproceedings{Kludas2008:mmiu,
title = {Can feature information interaction help for information fusion in multimedia problems?},
author = {Kludas, J.  and Bruno, E.  and Marchand-Maillet, S. },
booktitle = {First International Workshop on Metadata Mining for Image Understanding},
year = {2008},
pages = {23--33},
keywords = {Report_VII, IM2.MCA}
}

@inproceedings{motlicek:TSD:2007,
title = {Non-uniform speech/audio coding exploiting predictability of temporal evolution of spectral envelopes},
author = {Motlicek, P.  and Hermansky, H.  and Ganapathy, S.  and Garudadri, H. },
crossref = {motlicek:rr06-30},
booktitle = {Tenth International Conference on TEXT, SPEECH and DIALOGUE (TSD)},
year = {2007},
pages = {350--357},
issn = {0302-9743},
note = {IDIAP-RR 06-30},
keywords = {IM2.AP, Report_VII},
abstract = {Unlike classical state-of-the-art coders that are based on short-term spectra, our approach uses relatively long temporal segments of audio signal in critical-band-sized sub-bands. We apply auto-regressive model to approximate Hilbert envelopes in frequency sub-bands. Residual signals (Hilbert carriers) are demodulated and thresholding functions are applied in spectral domain. The Hilbert envelopes and carriers are quantized and transmitted to the decoder. Our experiments focused on designing speech/audio coder to provide broadcast radio-like quality audio around 15-25kbps. Obtained objective quality measures, carried out on standard speech recordings, were compared to the state-of-the-art 3GPP-AMR speech coding system.}
}

@inproceedings{ganapathy:aes:2008,
title = {Autoregressive modelling of hilbert envelopes for wide-band audio coding},
author = {Ganapathy, S.  and Motlicek, P.  and Hermansky, H.  and Garudadri, H. },
crossref = {ganapathy:rr08-40},
booktitle = {AES 124th Convention, Audio Engineering Society},
year = {2008},
location = {Amsterdam},
note = {IDIAP-RR 08-40},
keywords = {IM2.AP, Report_VII},
abstract = {Frequency Domain Linear Prediction (FDLP) represents the technique for approximating temporal envelopes of a signal using autoregressive models. In this paper, we propose a wide-band audio coding system exploiting FDLP. Specifically, FDLP is applied on critically sampled sub-bands to model the Hilbert envelopes. The residual of the linear prediction forms the Hilbert carrier, which is transmitted along with the envelope parameters. This process is reversed at the decoder to reconstruct the signal. In the objective and subjective quality evaluations, the FDLP based audio codec at $66$ kbps provides competitive results compared to the state-of-art codecs at similar bit-rates.}
}

@inproceedings{Gerber:07,
title = {Perceptron-based class verification},
author = {Gerber, M.  and Kaufmann, T.  and Pfister, B. },
booktitle = {Proceedings of NOLISP (ISCA Workshop on non linear speech processing)},
year = {2007},
keywords = {Report_VI, IM2.AP}
}

@techreport{marcel:rr06-47,
title = {Robust-to-illumination face localisation using active shape models and local binary patterns},
author = {Marcel, S.  and Keomany, J.  and Rodriguez, Y. },
year = {2006},
type = {IDIAP-RR},
number = {47},
institution = {IDIAP},
note = {Submitted for publication},
keywords = {Report_VI, IM2.VP},
abstract = {This paper addresses the problem of locating facial features in images of frontal faces taken under different lighting conditions. The well-known Active Shape Model method proposed by Cootes it et al. is extended to improve its robustness to illumination changes. For that purpose, we introduce the use of Local Binary Patterns (LBP). Three different incremental approaches combining ASM with LBP are presented: profile-based LBP-ASM, square-based LBP-ASM and divided-square-based LBP-ASM. Experiments performed on the standard and darkened image sets of the XM2VTS database demonstrate that the divided-square-based LBP-ASM gives superior performance compared to the state-of-the-art ASM. It achieves more accurate results and fails less frequently.},
ipdmembership = {vision},
pdf = {ftp://ftp.idiap.ch/pub/reports/2006/marcel-idiap-rr-06-47.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2006/marcel-idiap-rr-06-47.ps.gz}
}

@inproceedings{voloshynovskiy43,
title = {Multimodal authentication based on random projections and distributed coding},
author = {Voloshynovskiy, S.  and Koval, O.  and Beekhof, F.  and Pun, T. },
booktitle = {MM\&Sec 2008},
year = {2008},
keywords = {IM2.MPR, Report_VIII}
}

@inproceedings{Grandvalet_icml_2008,
title = {Composite Kernel Learning},
author = {Szafranski, M.  and Grandvalet, Y.  and Rakotomamonjy, A. },
editor = {McCallum, A.  and Roweis, S. },
crossref = {Grandvalet_Idiap-RR-59-2008},
booktitle = {Proceedings of the 25th Annual International Conference on Machine Learning (ICML 2008)},
year = {2008},
pages = {1040--1047},
publisher = {Omnipress},
note = {IDIAP-RR 08-59},
keywords = {IM2.MPR, Report_VIII},
abstract = {The Support Vector Machine (SVM) is an acknowledged powerful tool for building classifiers, but it lacks flexibility, in the sense that the kernel is chosen prior to learning. Multiple Kernel Learning (MKL) enables to learn the kernel, from an ensemble of basis kernels, whose combination is optimized in the learning process. Here, we propose Composite Kernel Learning to address the situation where distinct components give rise to a group structure among kernels. Our formulation of the learning problem encompasses several setups, putting more or less emphasis on the group structure. We characterize the convexity of the learning problem, and provide a general wrapper algorithm for computing solutions. Finally, we illustrate the behavior of our method on multi-channel data where groups correpond to channels.},
ipdmembership = {learning},
projects = {Idiap},
}

@inbook{ElenaMugellini20093,
title = {Memodules as tangible shortcuts to multimedia information},
author = {Mugellini, E.  and Lalanne, D.  and Dumas, B.  and Ev\'equoz, F.  and Gerardi, S.  and Le Calv\'e, A.  and Boder, A.  and Ingold, R.  and Khaled, O. },
year = {\bibnodate},
keywords = {IM2.HMI, Report_VIII}
}

@inproceedings{zheng07,
title = {Combining Discriminative Feature, Transform, and Model Training for Large Vocabulary Speech Recognition},
author = {Zheng, J.  and Cetin, O.  and Hwang, M. -Y.  and Lei, X.  and Stolcke, A.  and Morgan, N. },
booktitle = {Proc. ICASSP, Honolulu},
year = {2007},
keywords = {Report_VI, IM2.AP}
}

@inproceedings{eth_biwi_00559,
title = {A generalization of the {ICP} algorithm for articulated bodies},
author = {Pellegrini, S.  and Schindler, K.  and Nardi, D. },
editor = {Everingham, M.  and Needham, C. },
booktitle = {British Machine Vision Conference (BMVC'08)},
year = {2008},
keywords = {Report_VII, IM2.VP}
}

@article{eth_biwi_00556,
title = {Coupled object detection and tracking from static cameras and moving vehicles},
author = {Leibe, B.  and Schindler, K.  and Cornelis, N.  and van Gool, L. },
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
year = {2008},
keywords = {Report_VII, IM2.VP}
}

@inproceedings{eth_biwi_00557,
title = {World-scale mining of objects and events from community photo collections},
author = {Quack, T.  and Leibe, B.  and van Gool, L. },
booktitle = {Conference on Image and Video Retrieval (CIVR'08)},
year = {2008},
publisher = {ACM},
keywords = {Report_VII, IM2.MCA}
}

@inproceedings{eth_biwi_00555,
title = {Articulated multibody tracking under egomotion},
author = {Gammeter, S.  and Ess, A.  and Jaeggli, T.  and Leibe, B.  and Schindler, K.  and van Gool, L. },
booktitle = {European Conference on Computer Vision (ECCV'08)},
series = {LNCS},
year = {2008},
publisher = {Springer},
note = {in press},
keywords = {Report_VII, IM2.VP}
}

@inproceedings{eth_biwi_00553,
title = {Multi-label image segmentation via point-wise repetition},
author = {Zeng, G.  and van Gool, L. },
booktitle = {International Conference on Computer Vision and Pattern Recognition (CVPR)},
year = {2008},
keywords = {Report_VII, IM2.VP}
}

@techreport{le:rr07-02,
title = {Dynamical dirichlet mixture model},
author = {Chen, L.  and Barber, D.  and Odobez, J. -M. },
year = {2007},
type = {IDIAP-RR},
number = {02},
institution = {IDIAP},
keywords = {Report_VI, IM2.MPR},
abstract = {In this report, we propose a statistical model to deal with the discrete-distribution data varying over time. The proposed model -- HMM DM -- extends the Dirichlet mixture model to the dynamic case: Hidden Markov Model with Dirichlet mixture output. Both the inference and parameter estimation procedures are proposed. Experiments on the generated data verify the proposed algorithms. Finally, we discuss the potential applications of the current model.},
ipdmembership = {vision and learning},
pdf = {ftp://ftp.idiap.ch/pub/reports/2007/le-idiap-rr-07-02.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2007/le-idiap-rr-07-02.ps.gz}
}

@article{Kumatani_ASLP_2009,
title = {Beamforming with a Maximum Negentropy Criterion},
author = {Kumatani, K.  and McDonough, J.  and Rauch, B.  and Klakow, D.  and Garner, P. N.  and Li, W. },
crossref = {kumatani:rr08-29},
journal = {IEEE Transactions on Audio Speech and Language Processing},
year = {2009},
volume = {17},
number = {5},
pages = {994--1008},
keywords = {IM2.AP, Report_VIII},
abstract = {In this paper, we address a beamforming application based on the capture of far-field speech data from a single speaker in a real meeting room. After the position of the speaker is estimated by a speaker tracking system, we construct a subband-domain beamformer in generalized sidelobe canceller (GSC) configuration. In contrast to conventional practice, we then optimize the active weight vectors of the GSC so as to obtain an output signal with maximum negentropy (MN). This implies the beamformer output should be as non-Gaussian as possible. For calculating negentropy, we consider the and the generalized Gaussian (GG) pdfs. After MN beamforming, Zelinski post- filtering is performed to further enhance the speech by remov- ing residual noise. Our beamforming algorithm can suppress noise and reverberation without the signal cancellation problems encountered in the conventional beamforming algorithms. We demonstrate this fact through a set of acoustic simulations. More- over, we show the effectiveness of our proposed technique through a series of far-field automatic speech recognition experiments on the Multi-Channel Wall Street Journal Audio Visual Corpus (MC- WSJ-AV), a corpus of data captured with real far-field sensors, in a realistic acoustic environment, and spoken by real speakers. On the MC-WSJ-AV evaluation data, the delay-and-sum beamformer with post-filtering achieved a word error rate (WER) of 16.5\%. MN beamforming with the pdf achieved a 15.8\% WER, which was further reduced to 13.2\% with the GG pdf, whereas the simple delay-and-sum beamformer provided a WER of 17.8\%. To the best of our knowledge, no lower error rates at present have been reported in the literature on this ASR task.},
projects = {AMIDA},
}

@inproceedings{luo:icvs:2008,
title = {Object category detection using audio-visual cues},
author = {Luo, J.  and Caputo, B.  and Zweig, A.  and Back, J. -H.  and Anemuller, J. },
booktitle = {International Conference on Computer Vision Systems (ICVS08)},
year = {2008},
keywords = {IM2.AP, Report_VII},
abstract = {Categorization is one of the fundamental building blocks of cognitive systems. Object categorization has traditionally been addressed in the vision domain, even though cognitive agents are intrinsically multimodal. Indeed, biological systems combine several modalities in order to achieve robust categorization. In this paper we propose a multimodal approach to object category detection, using audio and visual information. The auditory channel is modeled on biologically motivated spectral features via a discriminative classifier. The visual channel is modeled by a state of the art part based model. Multimodality is achieved using two fusion schemes, one high level and the other low level. Experiments on six different object categories, under increasingly difficult conditions, show strengths and weaknesses of the two approaches, and clearly underline the open challenges for multimodal category detection.}
}

@book{Lalanne:Springer:2008,
title = {An ego-centric and tangible approach to meeting indexing and browsing},
author = {Lalanne, D.  and Rigamonti, M.  and Ingold, R.  and Ev\'equoz, F.  and Dumas, B. },
booktitle = {An ego-centric and tangible approach to meeting indexing and browsing},
series = {Lecture Notes in Computer Science},
year = {2008},
volume = {4892},
edition = {Computer Science},
publisher = {Springer Berlin / Heidelberg},
isbn = {978-3-540-78154-7},
doi = {10.1007/978-3-540-78155-4},
keywords = {IM2.DMA, Report_VII},
abstract = {This article presents an ego-centric approach for indexing and browsing meetings. The method considers two concepts: meetings data alignment with personal information to enable ego-centric browsing and live intentional annotation of meetings through tangible actions to enable ego-centric indexing. The article first motivates and introduces these concepts and further presents brief states-of-the-art of the domain of tangible user interaction, of document-centric multimedia browsing, a traditional tangible object to transport information, and of personal information management. The article then presents our approach in the context of meeting and details our methods to bridge the gap between meeting data and personal information. Finally the article reports the progress of the integration of this approach within Fribourgs meeting room.}
}

@techreport{Thomas_Idiap-RR-04-2009,
title = {Phoneme Recognition Using Spectral Envelope and Modulation Frequency Features},
author = {Thomas, S.  and Ganapathy, S.  and Hermansky, H. },
year = {2009},
type = {Idiap-RR},
number = {Idiap-RR-04-2009},
institution = {Idiap},
keywords = {IM2.AP, Report_VIII},
abstract = {We present a new feature extraction technique for phoneme recognition that uses short-term spectral envelope and modulation frequency features. These features are derived from sub-band temporal envelopes of speech estimated using Frequency Domain Linear Prediction (FDLP). While spectral envelope features are obtained by the short-term integration of the sub-band envelopes, the modulation frequency components are derived from the long-term evolution of the sub-band envelopes. These features are combined at the phoneme posterior level and used as features for a hybrid HMM-ANN phoneme recognizer. For the phoneme recognition task on the TIMIT database, the proposed features show an improvement of 4.7\% over the other feature extraction techniques.},
projects = {AMIDA},
}

@article{vinciarelli:tmm-2:2007,
title = {Role recognition in broadcast news using social network analysis and duration distribution modeling},
author = {Vinciarelli, A. },
journal = {IEEE Transactions on Multimedia},
year = {2007},
note = {IDIAP-RR 06-35},
keywords = {Report_VI, IM2.AP.MCA, joint publication},
abstract = {This paper presents two approaches for speaker role recognition in multiparty audio recordings. The experiments are performed over a corpus of 96 radio bulletins corresponding to roughly 19 hours of material. Each recording involves, on average, eleven speakers playing one among six roles belonging to a predefined set. Both proposed approaches start by segmenting automatically the recordings into single speaker segments, but perform role recognition using different techniques. The first approach is based on Social Network Analysis, the second relies on the intervention duration distribution across different speakers. The two approaches are used separately and combined and the results show that around 85 percent of the recordings time can be labeled correctly in terms of role.},
ipdmembership = {vision},
ipdxref = {techreport:vinciarelli-idiap-rr-06-35.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2007/vinciarelli-tmm-2-2007.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2007/vinciarelli-tmm-2-2007.ps.gz}
}

@inproceedings{Armstrong-3-ISSCO,
title = {A development environment for building grammar-based speech-enabled applications},
author = {Kron, E.  and Rayner, M.  and Santaholma, M.  and Bouillon, P. },
booktitle = {Proceedings of workshop on Grammar-based approaches to spoken language processing},
year = {2007},
pages = {49--52},
publisher = {ACL 2007},
location = {Prague, Czech Republic},
keywords = {Report_VI, IM2.HMI, ACL 2007, June 29}
}

@incollection{millan:2007:visionbook,
title = {Tapping the mind or resonating minds?},
author = {del R. Millán, J. },
editor = {Kidd, P. T. },
booktitle = {European Visions for the Knowledge Age},
year = {2007},
publisher = {Cheshire Henbury},
keywords = {Report_VI, IM2.BMI},
abstract = {Brains interfaced to machines, where thought is used to control and manipulate these machines. This is the vision examined in this chapter. First-generation brain-machine interfaces have already been developed, and technological developments must surely lead to increased capabilities in this field. The most obvious applications for these technologies are those that will assist disabled people. The technology can help restore mobility and communication capabilities, thus helping disabled people to increase their independence and facilitate their participation in society. But how should this technology be employed: just to manipulate the world or also to leverage self-knowledge? And what will the technology mean for the rest of the population? These are some of the questions that are addressed in this chapter.},
ipdmembership = {learning}
}

@inproceedings{Bruno2007,
title = {Combining multimodal preferences for multimedia information retrieval},
author = {Bruno, E.  and Kludas, J.  and Marchand-Maillet, S. },
booktitle = {Proc. of International Workshop on Multimedia Information Retrieval},
year = {2007},
keywords = {Report_VII, IM2.MCA}
}

@techreport{uldry:rr07-04,
title = {Feature selection methods on distributed linear inverse solutions for a non-invasive brain-machine interface},
author = {Uldry, L.  and Ferrez, P. W.  and del R. Millán, J. },
year = {2007},
type = {IDIAP-COM},
number = {04},
institution = {IDIAP},
keywords = {Report_VI, IM2.BMI},
ipdmembership = {learning},
pdf = {ftp://ftp.idiap.ch/pub/reports/2007/uldry-idiap-com-07-04.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2007/uldry-idiap-com-07-04.ps.gz}
}

@techreport{degreve:rr07-22,
title = {Keyword spotting on word lattices},
author = {Zacharie, D. G.  and Pinto, J. P. },
year = {2007},
type = {IDIAP-RR},
number = {22},
institution = {IDIAP},
keywords = {Report_VI, IM2.AP},
ipdmembership = {speech},
pdf = {ftp://ftp.idiap.ch/pub/reports/2007/degreve-idiap-rr-07-22.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2007/degreve-idiap-rr-07-22.ps.gz}
}

@inproceedings{liwicki06hmmBased,
title = {HMM-based on-line recognition of handwritten whiteboard notes},
author = {Liwicki, M.  and Bunke, H. },
booktitle = {Proceedings 10th International Workshop Frontiers in Handwriting Recognition},
year = {2006},
pages = {595--599},
keywords = {Report_VI, IM2.VP},
peer = {yes}
}

@techreport{norman:rr06-25,
title = {Estimating the confidence interval of expected performance curve in biometric authentication using joint bootstrap},
author = {Poh, N.  and Bengio, S. },
year = {2006},
type = {IDIAP-RR},
number = {25},
institution = {IDIAP},
note = {Submitted for publication},
keywords = {Report_VI, IM2.MPR},
abstract = {Evaluating biometric authentication performance is a complex task because the performance depends on the user set size, composition and the choice of samples. We propose to reduce the performance dependency of these three factors by deriving appropriate confidence intervals. In this study, we focus on deriving a confidence region based on the recently proposed Expected Performance Curve (EPC). An EPC is different from the conventional DET or ROC curve because an EPC assumes that the test class-conditional (client and impostor) score distributions are unknown and this includes the choice of the decision threshold for various operating points. Instead, an EPC selects thresholds based on the training set and applies them on the test set. The proposed technique is useful, for example, to quote realistic upper and lower bounds of the decision cost function used in the NIST annual speaker evaluation. Our findings, based on the 24 systems submitted to the NIST2005 evaluation, show that the confidence region obtained from our proposed algorithm can correctly predict the performance of an unseen database with two times more users with an average coverage of 95\% (over all the 24 systems). A coverage is the proportion of the unseen EPC covered by the derived confidence interval.},
ipdmembership = {learning},
pdf = {ftp://ftp.idiap.ch/pub/reports/2006/norman-idiap-rr-06-25.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2006/norman-idiap-rr-06-25.ps.gz}
}

@proceedings{grandjean2008:riederalp,
  title    = {Multimodality in emotions and for their assessment},
  author   = {Grandjean, D. and Pun, T.},
  editor   = {Grandjean, D. and Pun, T.},
  year     = {2008},
  note     = {Workshop at Joint (IM)2-Interactive Multimodal Information Management and Affective Sciences NCCRs meeting},
  keywords = {Report_VII, IM2.MCA}
}

@inproceedings{morrison2008:cbmi,
  title     = {Semantic clustering of images using patterns of relevance feedback},
  author    = {Morrison, D. and Marchand-Maillet, S. and Bruno, E.},
  booktitle = {Proceedings of the 6th International Workshop on Content-based Multimedia Indexing (CBMI'2008)},
  year      = {2008},
  keywords  = {Report_VII, IM2.MCA}
}

@article{Caputo_ELCVIA_2008,
  title    = {Class specific object recognition using kernel {Gibbs} distributions},
  author   = {Caputo, B.},
  journal  = {Electronic Letters on Computer Vision and Image Analysis},
  year     = {2008},
  volume   = {7},
  number   = {2},
  pages    = {96--109},
  note     = {Special Issue on Computational Modelling of Objects Represented in Images},
  keywords = {IM2.BMI, Report_VIII},
  projects = {Idiap}
}

@inproceedings{Melichar2006/LIA,
  title       = {From Vocal to Multimodal Dialogue Management},
  author      = {Melichar, M. and Cenek, P. and Ailomaa, M. and Lisowska, A. and Rajman, M.},
  booktitle   = {Eighth International Conference on Multimodal Interfaces (ICMI'06)},
  year        = {2006},
  keywords    = {Report_VI, IM2.HMI},
  affiliation = {EPFL},
  details     = {http://infoscience.epfl.ch/search.py?recid=98327},
  oai-id      = {oai:infoscience.epfl.ch:98327},
  oai-set     = {conf},
  status      = {PUBLISHED},
  unit        = {LIA}
}

@inproceedings{JayagopiICME08,
  title     = {Characterizing conversational group dynamics using nonverbal behavior},
  author    = {Jayagopi, D. and Raducanu, B. and Gatica-Perez, D.},
  booktitle = {Proc. IEEE Int. Conf. on Multimedia (ICME)},
  year      = {2008},
  keywords  = {IM2.MPR, Report_VIII}
}

@inproceedings{indermuehle08recognition,
  title     = {Recognition of handwritten historical documents: {HMM}-adaptation vs. writer specific training},
  author    = {Indermühle, E. and Liwicki, M. and Bunke, H.},
  booktitle = {Proc. 11th Int. Conf. on Frontiers in Handwriting Recognition},
  year      = {2008},
  pages     = {186--191},
  keywords  = {IM2.VP, Report_VIII},
  peer      = {yes}
}

@article{Meynet2006_1566/LTS,
  title       = {Mixtures of Boosted Classifiers for Frontal Face Detection},
  author      = {Meynet, J. and Popovici, V. and Thiran, J. -Ph.},
  journal     = {Signal, Image and Video Processing},
  year        = {2007},
  volume      = {1},
  number      = {1},
  pages       = {29--38},
  url         = {http://infoscience.epfl.ch/getfile.py?recid=91045&mode=best},
  doi         = {10.1007/s11760-007-0003-x},
  keywords    = {Report_VII, IM2.VP, combination of classifiers; face detection; gaussian features; lts5},
  affiliation = {EPFL},
  details     = {http://infoscience.epfl.ch/record/91045},
  oai-id      = {oai:infoscience.epfl.ch:91045},
  oai-set     = {article; fulltext},
  status      = {PUBLISHED},
  unit        = {LTS}
}

@inproceedings{eth_biwi_00531,
  title     = {Towards mass-produced building models},
  author    = {van Gool, L. and Zeng, G. and van den Borre, F. and Müller, P.},
  editor    = {Stilla, U. and Mayer, H. and Rottensteiner, F. and Heipke, C. and Hinz, S.},
  booktitle = {Photogrammetric Image Analysis},
  year      = {2007},
  pages     = {209--220},
  publisher = {Institute of Photogrammetry and Cartography, Technische Universitaet Muenchen},
  keywords  = {Report_VII, IM2.VP}
}

@inproceedings{BrunoDumas200911,
  title     = {Benchmarking fusion engines of multimodal interactive systems},
  author    = {Dumas, B. and Lalanne, D. and Ingold, R.},
  booktitle = {Proceedings of International Conference on Multimodal Interfaces and Workshop on Machine Learning for Multi-modal Interaction (ICMI-MLMI 2009)},
  year      = {2009},
  keywords  = {IM2.HMI, Report_VIII}
}

@inproceedings{eth_biwi_00532,
  title     = {Action snippets: how many frames does human action recognition require?},
  author    = {Schindler, K. and van Gool, L.},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR'08)},
  year      = {2008},
  publisher = {IEEE Press},
  keywords  = {Report_VII, IM2.VP}
}

@incollection{Grangier_WILEY_2009,
  title     = {Discriminative Keyword Spotting},
  author    = {Grangier, D. and Keshet, J. and Bengio, S.},
  editor    = {Keshet, J. and Bengio, S.},
  booktitle = {Automatic Speech and Speaker Recognition: Large Margin and Kernel Methods},
  year      = {2009},
  publisher = {John Wiley and Sons},
  keywords  = {IM2.AP, Report_VIII},
  abstract  = {This chapter introduces a discriminative method for detecting and spotting keywords in spoken utterances. Given a word represented as a sequence of phonemes and a spoken utterance, the keyword spotter predicts the best time span of the phoneme sequence in the spoken utterance along with a confidence. If the prediction confidence is above certain level the keyword is declared to be spoken in the utterance within the predicted time span, otherwise the keyword is declared as not spoken. The problem of keyword spotting training is formulated as a discriminative task where the model parameters are chosen so the utterance in which the keyword is spoken would have higher confidence than any other spoken utterance in which the keyword is not spoken. It is shown theoretically and empirically that the proposed training method resulted with a high area under the receiver operating characteristic (ROC) curve, the most common measure to evaluate keyword spotters. We present an iterative algorithm to train the keyword spotter efficiently. The proposed approach contrasts with standard spotting strategies based on HMMs, for which the training procedure does not maximize a loss directly related to the spotting performance. Several experiments performed on TIMIT and WSJ corpora show the advantage of our approach over HMM-based alternatives.},
  projects  = {Idiap}
}

@incollection{pop2009-thiran,
  title     = {Multimodal database annotation formats and standards, software architecture for multimodal interfaces},
  author    = {Popescu-Belis, A.},
  editor    = {Thiran, J. -Ph. and Bourlard, H. and Marques, F.},
  booktitle = {Multimodal Signal Processing: Methods and Techniques to Build Multimodal Interactive Systems},
  year      = {\bibnodate},
  note      = {in press},
  keywords  = {IM2.DMA, Report_VIII}
}

@inproceedings{eth_biwi_00544,
  title     = {Using recognition to guide a robot's attention},
  author    = {Thomas, A. and Ferrari, V. and Leibe, B. and Tuytelaars, T. and van Gool, L.},
  booktitle = {Robotics Science and Systems},
  year      = {2008},
  note      = {in press},
  keywords  = {Report_VII, IM2.VP}
}

@article{millan:2008:ijprai,
  title    = {Non-invasive brain-machine interaction},
  author   = {Millán, J. del R. and Ferrez, P. W. and Galán, F. and Lew, E. and Chavarriaga, R.},
  journal  = {International Journal of Pattern Recognition and Artificial Intelligence},
  year     = {2008},
  keywords = {IM2.BMI, Report_VII},
  abstract = {The promise of Brain-Computer Interfaces (BCI) technology is to augment human capabilities by enabling interaction with computers through a conscious and spontaneous modulation of the brainwaves after a short training period. Indeed, by analyzing brain electrical activity online, several groups have designed brain-actuated devices that provide alternative channels for communication, entertainment and control. Thus, a person can write messages using a virtual keyboard on a computer screen and also browse the internet. Alternatively, subjects can operate simple computer games, or brain games, and interact with educational software. Work with humans has shown that it is possible for them to move a cursor and even to drive a wheelchair. This paper briefly reviews the field of BCI, with a focus on non-invasive systems based on electroencephalogram (EEG) signals. It also describes three brain-actuated devices we have developed: a virtual keyboard, a brain game, and a wheelchair. Finally, it shortly discusses current research directions we are pursuing in order to improve the performance and robustness of our BCI system, especially for real-time control of brainactuated robots.}
}

@incollection{millan:2006:mit-lfp,
  title     = {Non-invasive estimates of local field potentials for brain-computer interfaces},
  author    = {Grave de Peralta Menendez, R. and González Andino, S. L. and Ferrez, P. W. and Millán, J. del R.},
  editor    = {Dornhege, G. and Millán, J. del R. and Hinterberger, T. and McFarland, D. and Müller, K. -R.},
  booktitle = {Towards Brain-Computer Interfacing},
  year      = {2007},
  publisher = {The MIT Press},
  keywords  = {IM2.BCI, Report_VII},
  abstract  = {Recent experiments have shown the possibility to use the brain electrical activity to directly control the movement of robots or prosthetic devices in real time. Such neuroprostheses can be invasive or non-invasive, depending on how the brain signals are recorded. In principle, invasive approaches will provide a more natural and flexible control of neuroprostheses, but their use in humans is debatable given the inherent medical risks. Non-invasive approaches mainly use scalp electroencephalogram (EEG) signals and their main disadvantage is that these signals represent the noisy spatiotemporal overlapping of activity arising from very diverse brain regions; i.e., a single scalp electrode picks up and mixes the temporal activity of myriads of neurons at very different brain areas. In order to combine the benefits of both approaches, we propose to rely on the non-invasive estimation of local field potentials (eLFP) in the whole human brain from the scalp measured EEG data using a recently developed inverse solution (ELECTRA) to the EEG inverse problem. The goal of a linear inverse procedure is to deconvolve or unmix the scalp signals attributing to each brain area its own temporal activity. To illustrate the advantage of this approach we compare, using identical set of spectral features, classification of rapid voluntary finger self-tapping with left and right hands based on scalp EEG and eLFP on three subjects using different number of electrodes. It is shown that the eLFP-based Gaussian classifier outperforms the EEG-based Gaussian classifier for the three subjects.}
}

@inproceedings{Aradilla_ICASSP_2009,
  title     = {Posterior features applied to speech recognition tasks with user-defined vocabulary},
  author    = {Aradilla, G. and Bourlard, H. and Magimai-Doss, M.},
  booktitle = {Proceedings of IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year      = {2009},
  keywords  = {IM2.AP, Report_VIII},
  projects  = {SNSF-MULTI, AMIDA, IM2}
}

@techreport{mesot:rr06-55,
  title        = {A bayesian alternative to gain adaptation in autoregressive hidden markov models},
  author       = {Mesot, B. and Barber, D.},
  year         = {2006},
  type         = {IDIAP-RR},
  number       = {55},
  institution  = {IDIAP},
  keywords     = {Report_VI, IM2.MPR},
  abstract     = {Models dealing directly with the raw acoustic speech signal are an alternative to conventional feature-based HMMs. A popular way to model the raw speech signal is by means of an autoregressive (AR) process. Being too simple to cope with the nonlinearity of the speech signal, the AR process is generally embedded into a more elaborate model, such as the switching autoregressive HMM (SAR-HMM). A fundamental issue faced by models based on AR processes is that they are very sensitive to variations in the amplitude of the signal. One way to overcome this limitation is to use Gain Adaptation to adjust the amplitude by maximising the likelihood of the observed signal. However, adjusting model parameters by maximising test likelihoods is fundamentally outside the framework of standard statistical approaches to machine learning, since this may lead to overfitting when the models are sufficiently flexible. We propose a statistically principled alternative based on an exact Bayesian procedure in which priors are explicitly defined on the parameters of the AR process. Explicitly, we present the Bayesian SAR-HMM and compare the performance of this model against the standard Gain-Adapted SAR-HMM on a single digit recognition task, showing the effectiveness of the approach and suggesting thereby a principled and straightforward solution to the issue of Gain Adaptation.},
  ipdmembership = {speech},
  pdf          = {ftp://ftp.idiap.ch/pub/reports/2006/mesot-idiap-rr-06-55.pdf},
  postscript   = {ftp://ftp.idiap.ch/pub/reports/2006/mesot-idiap-rr-06-55.ps.gz}
}

@inproceedings{bengio:mcs:2007,
  title        = {Biometric person authentication is a multiple classifier problem},
  author       = {Bengio, S. and Mari{\'e}thoz, J.},
  booktitle    = {7th International Workshop on Multiple Classifier Systems, MCS},
  year         = {2007},
  note         = {IDIAP-RR 07-03},
  keywords     = {Report_VI, IM2.MPR},
  abstract     = {Several papers have already shown the interest of using multiple classifiers in order to enhance the performance of biometric person authentication systems. In this paper, we would like to argue that the core task of Biometric Person Authentication is actually a multiple classifier problem as such: indeed, in order to reach state-of-the-art performance, we argue that all current systems, in one way or another, try to solve several tasks simultaneously and that without such joint training (or sharing), they would not succeed as well. We explain hereafter this perspective, and according to it, we propose some ways to take advantage of it, ranging from more parameter sharing to similarity learning.},
  ipdmembership = {learning},
  ipdxref      = {techreport:bengio-idiap-rr-07-03.bib},
  pdf          = {ftp://ftp.idiap.ch/pub/papers/2007/bengio-mcs-2007.pdf},
  postscript   = {ftp://ftp.idiap.ch/pub/papers/2007/bengio-mcs-2007.ps.gz}
}

@inproceedings{Kaufmann2007,
  title     = {Applying licenser rules to a grammar with continuous constituents},
  author    = {Kaufmann, T. and Pfister, B.},
  booktitle = {The Proceedings of the 14th International Conference on Head-Driven Phrase Structure Grammar},
  year      = {2007},
  keywords  = {Report_VII, IM2.AP},
  owner     = {dines}
}

@article{silviachiappa:ieee_spl:2007,
  title        = {Bayesian factorial linear gaussian state-space models for biosignal decomposition},
  author       = {Chiappa, S. and Barber, D.},
  journal      = {IEEE Signal Processing Letters},
  year         = {2007},
  note         = {IDIAP-RR 05-84},
  keywords     = {Report_VI, IM2.BMI},
  abstract     = {We discuss a method to extract independent dynamical systems underlying a single or multiple channels of observation. In particular, we search for one dimensional subsignals to aid the interpretability of the decomposition. The method uses an approximate Bayesian analysis to determine automatically the number and appropriate complexity of the underlying dynamics, with a preference for the simplest solution. We apply this method to unfiltered EEG signals to discover low complexity sources with preferential spectral properties, demonstrating improved interpretability of the extracted sources over related methods.},
  ipdmembership = {learning},
  ipdxref      = {techreport:silviachiappa-idiap-rr-05-84.bib},
  pdf          = {ftp://ftp.idiap.ch/pub/papers/2007/silviachiappa-ieee_spl-2007.pdf}
}

@inproceedings{IWINAC2007,
  title     = {Identifying major components of pictures by audio encoding of colors},
  author    = {Bologna, G. and Deville, B. and Pun, T. and Vinckenbosch, M.},
  booktitle = {IWINAC2007, 2nd. Int. Work-conf. on the Interplay between Natural and Artificial Computation},
  year      = {2007},
  keywords  = {Report_VI, IM2.MPR}
}

@techreport{keller:rr06-44,
  title        = {A multitask learning approach to document representation using unlabeled data},
  author       = {Keller, M. and Bengio, S.},
  year         = {2006},
  type         = {IDIAP-RR},
  number       = {44},
  institution  = {IDIAP},
  keywords     = {Report_VI, IM2.MPR.MCA, joint publication},
  abstract     = {Text categorization is intrinsically a supervised learning task, which aims at relating a given text document to one or more predefined categories. Unfortunately, labeling such databases of documents is a painful task. We present in this paper a method that takes advantage of huge amounts of unlabeled text documents available in digital format, to counter balance the relatively smaller available amount of labeled text documents. A Siamese MLP is trained in a multi-task framework in order to solve two concurrent tasks: using the unlabeled data, we search for a mapping from the documents' bag-of-word representation to a new feature space emphasizing similarities and dissimilarities among documents; simultaneously, this mapping is constrained to also give good text categorization performance over the labeled dataset. Experimental results on Reuters RCV1 suggest that, as expected, performance over the labeled task increases as the amount of unlabeled data increases.},
  ipdmembership = {learning},
  pdf          = {ftp://ftp.idiap.ch/pub/reports/2006/keller-idiap-rr-06-44.pdf},
  postscript   = {ftp://ftp.idiap.ch/pub/reports/2006/keller-idiap-rr-06-44.ps.gz}
}

@incollection{eth_biwi_bfasel_07a,
  title     = {Interactive museum guide: accurate retrieval of object descriptions},
  author    = {Fasel, B. and van Gool, L.},
  editor    = {Marchand-Maillet, S. and Bruno, E. and Nürnberger, A. and Detyniecki, M.},
  booktitle = {Adaptive Multimedia Retrieval: User, Context, and Feedback},
  year      = {2007},
  pages     = {179--191},
  publisher = {Springer},
  keywords  = {Report_VI, IM2.VP}
}

@inproceedings{MRCS2006,
  title     = {Emotion assessment: arousal evaluation using {EEG}'s and peripheral physiological signals},
  author    = {Chanel, G. and Kronegg, J. and Grandjean, D. and Pun, T.},
  editor    = {Gunsel, B. and Jain, A. K. and Tekalp, A. M. and Sankur, B.},
  booktitle = {Proc. Int. Workshop Multimedia Content Representation, Classification and Security (MRCS)},
  year      = {2006},
  series    = {Lecture Notes in Computer Science},
  volume    = {4105},
  pages     = {530--537},
  publisher = {Springer},
  keywords  = {Report_VI, IM2.MPR}
}

@inproceedings{Jayagopi:ICMI:2008,
  title     = {Predicting two facets of social verticality in meetings from five-minute time slices and nonverbal cues},
  author    = {Jayagopi, D. and Ba, S. and Odobez, J. -M. and Gatica-Perez, D.},
  booktitle = {Proc. Int. Conf. on Multimodal Interfaces (ICMI), Special Session on Social Signal Processing},
  year      = {2008},
  keywords  = {IM2.VP, Report_VIII}
}

@inproceedings{Boakye2008a,
  title     = {Overlapped speech detection for improved speaker diarization in multiparty meetings},
  author    = {Boakye, K. and Trueba-Hornero, B. and Vinyals, O. and Friedland, G.},
  booktitle = {International Conference on Acoustics, Speech, and Signal Processing},
  year      = {2008},
  keywords  = {Report_VII, IM2.AP},
  owner     = {dines}
}

@incollection{Keshet_WILEY_2009,
  title     = {A Large Margin Algorithm for Forced Alignment},
  author    = {Keshet, J. and Shalev-Shwartz, S. and Singer, Y. and Chazan, D.},
  editor    = {Keshet, J. and Bengio, S.},
  booktitle = {Automatic Speech and Speaker Recognition: Large Margin and Kernel Methods},
  year      = {2009},
  publisher = {John Wiley and Sons},
  keywords  = {IM2.AP, Report_VIII},
  abstract  = {We describe and analyze a discriminative algorithm for learning to align a phoneme sequence of a speech utterance with its acoustical signal counterpart by predicting a timing sequence representing the phoneme start times. In contrast to common HMM-based approaches, our method employs a discriminative learning procedure in which the learning phase is tightly coupled with the forced alignment task. The alignment function we devise is based on mapping the input acoustic-symbolic representations of the speech utterance along with the target timing sequence into an abstract vector space. We suggest a specific mapping into the abstract vector-space which utilizes standard speech features (e.g. spectral distances) as well as confidence outputs of a frame-based phoneme classifier. Generalizing the notion of separation with a margin used in support vector machines (SVM) for binary classification, we cast the learning task as the problem of finding a vector in an abstract inner-product space. We set the prediction vector to be the solution of a minimization problem with a large set of constraints. Each constraint enforces a gap between the projection of the correct target timing sequence and the projection of an alternative, incorrect, timing sequence onto the vector. Though the number of constraints is very large, we describe a simple iterative algorithm for efficiently learning the vector and analyze the formal properties of the resulting learning algorithm. We report experimental results comparing the proposed algorithm to previous studies on forced alignment, which use hidden Markov models (HMM). The results obtained in our experiments using the discriminative alignment algorithm outperform the state-of-the-art systems on the TIMIT corpus.},
  projects  = {Idiap}
}

@book{Keshet_WILEY_2008,
  title     = {Automatic speech and speaker recognition: large margin and kernel methods},
  author    = {Keshet, J. and Bengio, S.},
  year      = {2008},
  publisher = {John Wiley \& Sons},
  keywords  = {IM2.AP, Report_VII},
  abstract  = {This is the first book dedicated to uniting research related to speech and speaker recognition based on the recent advances in large margin and kernel methods. The first part of the book presents theoretical and practical foundations of large margin and kernel methods, from support vector machines to large margin methods for structured learning. The second part of the book is dedicated to acoustic modeling of continuous speech recognizers, where the grounds for practical large margin sequence learning are set. The third part introduces large margin methods for discriminative language modeling. The last part of the book is dedicated to the application of keyword spotting, speaker verification and spectral clustering. The book is an important reference to researchers and practitioners in the field of modern speech and speaker recognition. The purpose of the book is twofold; first, to set the theoretical foundation of large margin and kernel methods relevant to speech recognition domain; second, to propose a practical guide on implementation of these methods to the speech recognition domain. The reader is presumed to have basic knowledge of large margin and kernel methods and of basic algorithms in speech and speaker recognition.}
}

@incollection{bertolami08emsemble,
  title     = {Ensemble methods to improve the performance of an english handwritten text line recognizer},
  author    = {Bertolami, R. and Bunke, H.},
  editor    = {Doerman, D. and Jaeger, S.},
  booktitle = {Arabic and Chinese Handwriting Recognition},
  series    = {LNCS 4768},
  year      = {2008},
  pages     = {265--277},
  publisher = {Springer},
  keywords  = {Report_VII, IM2.VP},
  peer      = {yes}
}

@inproceedings{Li_ICASSP_2009,
  title     = {Non-linear mapping for multi-channel speech separation and robust overlapping speech recognition},
  author    = {Li, W. and Dines, J. and Magimai-Doss, M. and Bourlard, H.},
  booktitle = {Proceedings of IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
  year      = {2009},
  month     = apr,
  keywords  = {binary masking, microphone array, neural network, overlapping speech recognition, speech separation, IM2.AP, Report_VIII},
  abstract  = {This paper investigates a non-linear mapping approach to extract robust features for ASR and separation of overlapping speech. Based on our previous studies, we continue to use two additional sound sources, namely, from the target and interfering speakers. The focuses of this work are: 1) We investigate the feature mapping between different domains with the consideration of MMSE criterion and regression optimizations, demonstrating the mapping of log mel-filterbank energies to MFCC can be exploited to improve the effectiveness of the regression; 2) We investigate the data-driven filtering for the speech separation by using the mapping method, which can be viewed as a generalized log spectral subtraction and results in better separation performance. We demonstrate the effectiveness of the proposed approach through extensive evaluations on the MONC corpus, which includes both non-overlapping single speaker and overlapping multi-speaker conditions.},
  projects  = {Idiap, AMIDA, IM2}
}

@inproceedings{MMSPL-CONF-20096-001,
  title       = {A subjective study of the influence of color information on visual quality assessment of high resolution pictures},
  author      = {De Simone, F. and Dufaux, F. and Ebrahimi, T. and Delogu, C. and Baroncini, V.},
  booktitle   = {Fourth International Workshop on Video Processing and Quality Metrics for Consumer Electronics (VPQM-09)},
  year        = {2009},
  location    = {Scottsdale, Arizona, USA},
  url         = {http://infoscience.epfl.ch/getfile.py?recid=130751&mode=best},
  keywords    = {IM2.MCA, Report_VIII},
  abstract    = {This paper presents the design and the results of a psychovisual experiment which aims at understanding how the color information affects the perceived quality of a high resolution still picture. The results of this experiment help to shed light into the importance of color for human observers and could be used to improve the performance of objective quality metrics.},
  affiliation = {EPFL},
  details     = {http://infoscience.epfl.ch/record/130751},
  oai-id      = {oai:infoscience.epfl.ch:130751},
  oai-set     = {conf; fulltext; fulltext-public},
  review      = {REVIEWED},
  status      = {PUBLISHED},
  unit        = {MMSPL}
}

@article{Shahrokni_TPAMI_2008,
  title    = {Classification-based Probabilistic Modeling of Texture Transition for Fast Line Search Tracking and Delineation},
  author   = {Shahrokni, A. and Drummond, T. and Fleuret, F. and Fua, P.},
  journal  = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year     = {2008},
  keywords = {IM2.BMI, Report_VIII},
  projects = {Idiap, IM2}
}

@inproceedings{millan:2007:ijcai,
  title     = {Augmenting astronaut's capabilities through brain-machine interfaces},
  author    = {Broschart, M. and de Negueruela, C. and Millán, J. del R. and Menon, C.},
  booktitle = {Proceedings of the 20th International Joint Conference on Artificial Intelligence, Workshop on Artificial Intelligence for Space Applications},
  year      = {2007},
  keywords  = {IM2.BCI, Report_VI},
  abstract  = {Brain-Machine Interfaces (BMIs) transform the brain activity of a human operator into executable commands that can be sent to a machine, usually a computer or robot, to perform intended tasks. In addition to current biomedical applications, available technology could also make feasible augmenting devices for space applications that could be promising means to improve astronauts' efficiency and capabilities. The implementation of artificial intelligence algorithms into the software architecture of present BMIs will be of crucial importance to guarantee a proper functionality of the device in the highly dynamic and unpredictable space environment.}
}

@inproceedings{Drygajlo6,
  title     = {What do quality measures predict in biometrics},
  author    = {Kryszczuk, K. and Drygajlo, A.},
  booktitle = {16th European Signal Processing Conference},
  year      = {2008},
  keywords  = {IM2.MPR, Report_VIII}
}

@phdthesis{lathoud-thesis,
  title        = {Spatio-temporal analysis of spontaneous speech with microphone arrays},
  author       = {Lathoud, G.},
  year         = {2006},
  school       = {\'Ecole Polytechnique F\'ed\'erale de Lausanne},
  note         = {PhD Thesis \#3689 at the \'Ecole Polytechnique F\'ed\'erale de Lausanne (IDIAP-RR 06-77)},
  keywords     = {Report_VI, IM2.AP.VP, joint publication},
  abstract     = {Accurate detection, localization and tracking of multiple moving speakers permits a wide spectrum of applications. Techniques are required that are versatile, robust to environmental variations, and not constraining for non-technical end-users. Based on distant recording of spontaneous multiparty conversations, this thesis focuses on the use of microphone arrays to address the question ``Who spoke where and when?''. The speed, the versatility and the robustness of the proposed techniques are tested on a variety of real indoor recordings, including multiple moving speakers as well as seated speakers in meetings. Optimized implementations are provided in most cases. We propose to discretize the physical space into a few sectors, and for each time frame, to determine which sectors contain active acoustic sources (Where? When?). A topological interpretation of beamforming is proposed, which permits both to evaluate the average acoustic energy in a sector for a negligible cost, and to locate precisely a speaker within an active sector. One additional contribution that goes beyond the field of microphone arrays is a generic, automatic threshold selection method, which does not require any training data. On the speaker detection task, the new approach is dramatically superior to the more classical approach where a threshold is set on training data. We use the new approach into an integrated system for multispeaker detection-localization. Another generic contribution is a principled, threshold-free, framework for short-term clustering of multispeaker location estimates, which also permits to detect where and when multiple trajectories intersect. On multi-party meeting recordings, using distant microphones only, short-term clustering yields a speaker segmentation performance similar to that of close-talking microphones. 
The resulting short speech segments are then grouped into speaker clusters (Who?), through an extension of the Bayesian Information Criterion to merge multiple modalities. On meeting recordings, the speaker clustering performance is significantly improved by merging the classical mel-cepstrum information with the short-term speaker location information. Finally, a close analysis of the speaker clustering results suggests that future research should investigate the effect of human acoustic radiation characteristics on the overall transmission channel, when a speaker is a few meters away from a microphone.},
  ipdmembership = {speech},
  pdf          = {ftp://ftp.idiap.ch/pub/reports/2006/rr-06-77.pdf},
  postscript   = {ftp://ftp.idiap.ch/pub/reports/2006/rr-06-77.ps.gz}
}

@article{monay:pami:2007,
  title    = {Modeling semantic aspects for cross-media image indexing},
  author   = {Monay, F. and Gatica-Perez, D.},
  crossref = {monay:rr05-56},
  journal  = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year     = {2007},
  volume   = {29},
  pages    = {1802--1817},
  issn     = {0162-8828},
  note     = {IDIAP-RR 05-56},
  doi      = {10.1109/tpami.2007.1097},
  keywords = {IM2.MCA, Report_VII},
  abstract = {To go beyond the query-by-example paradigm in image retrieval, there is a need for semantic indexing of large image collections for intuitive text-based image search. Different models have been proposed to learn the dependencies between the visual content of an image set and the associated text captions, then allowing for the automatic creation of semantic indices for unannotated images. The task, however, remains unsolved. In this paper, we present three alternatives to learn a Probabilistic Latent Semantic Analysis model (PLSA) for annotated images, and evaluate their respective performance for automatic image indexing. Under the PLSA assumptions, an image is modeled as a mixture of latent aspects that generates both image features and text captions, and we investigate three ways to learn the mixture of aspects. We also propose a more discriminative image representation than the traditional Blob histogram, concatenating quantized local color information and quantized local texture descriptors. The first learning procedure of a PLSA model for annotated images is a standard EM algorithm, which implicitly assumes that the visual and the textual modalities can be treated equivalently. The other two models are based on an asymmetric PLSA learning, allowing to constrain the definition of the latent space on the visual or on the textual modality. We demonstrate that the textual modality is more appropriate to learn a semantically meaningful latent space, which translates into improved annotation performance. A comparison of our learning algorithms with respect to recent methods on a standard dataset is presented, and a detailed evaluation of the performance shows the validity of our framework.}
}

% NOTE(review): "14\eme" was an invalid macro (dropped backslash-accent in an
% export); restored as the BibTeX special character {\`e} for correct sorting.
@inproceedings{Armstrong-11-ISSCO,
title = {Exploiting structural meeting-specific features for topic segmentation},
author = {Georgescul, M.  and Clark, A.  and Armstrong, S. },
booktitle = {Actes de la 14{\`e}me Conf\'erence sur le Traitement Automatique des Langues Naturelles},
year = {2007},
location = {Toulouse, France},
keywords = {Report_VI, IM2.MCA, major}
}

% NOTE(review): publication status and venue were stuffed into booktitle;
% moved to note/location so styles render the proceedings title cleanly.
@inproceedings{Humm07:icdar,
title = {Spoken handwriting verification using statistical models},
author = {Humm, A.  and Hennebert, J.  and Ingold, R. },
booktitle = {International Conference on Document Analysis and Recognition (ICDAR 07)},
year = {2007},
location = {Curitiba, Brazil},
note = {Accepted for publication},
keywords = {Report_VI, IM2.MPR}
}

% NOTE(review): removed bogus "doi = {na}" -- a non-DOI value breaks DOI
% hyperlinking in url/doi-aware styles; the details/url fields remain.
@article{LTS-ARTICLE-2007-004,
title = {Accelerating Distributed Consensus Using Extrapolation},
author = {Kokiopoulou, E.  and Frossard, P. },
journal = {IEEE Signal Processing Letters},
year = {2007},
volume = {14},
number = {10},
url = {http://infoscience.epfl.ch/getfile.py?recid=100858&mode=best},
keywords = {Report_VI, IM2.VP},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/search.py?recid=100858},
oai-id = {oai:infoscience.epfl.ch:100858},
oai-set = {article},
review = {REVIEWED},
status = {PUBLISHED},
unit = {LTS}
}

% NOTE(review): series held both the series name and the volume number;
% split into the full series name plus a separate volume field.
@inproceedings{frinken09self,
title = {Self-training strategies for handwriting word recognition},
author = {Frinken, V.  and Bunke, H. },
booktitle = {Proc. Industrial Conf. Advances in Data Mining. Applications and Theoretical Aspects},
series = {Lecture Notes in Computer Science},
volume = {5633},
year = {2009},
pages = {291--300},
publisher = {Springer},
isbn = {978-3-642-03066-6},
keywords = {IM2.VP, Report_VIII},
peer = {yes}
}

% NOTE(review): fields are consistent for an Idiap research report; the
% trailing comma after the last field is tolerated by BibTeX and Biber.
@techreport{Yao_Idiap-RR-19-2009,
title = {Fast Human Detection in Videos using Joint Appearance and Foreground Learning from Covariances of Image Feature Subsets},
author = {Yao, J.  and Odobez, J. -M. },
year = {2009},
type = {Idiap-RR},
number = {Idiap-RR-19-2009},
institution = {Idiap},
keywords = {IM2.VP, Report_VIII},
abstract = {We present a fast method to detect humans from stationary surveillance videos. Traditional approaches exploit background subtraction as an attentive filter, by applying the still image detectors only on foreground regions. This doesn't take into account that foreground observations contain human shape information which can be used for detection. To address this issue, we propose a method that learn the correlation between appearance and foreground information. It is based on a cascade of LogitBoost classifiers which uses covariance matrices computed from appearance and foreground features as object descriptors. We account for the fact that covariance matrices lie in a Riemanian space, introduce different novelties -like exploiting only covariance sub-matrices- to reduce the induced computation load, as well as an image rectification scheme to remove the slant of people in images when dealing with wide angle cameras. Evaluation on a large set of videos shows that our approach performs better than the attentive filter paradigm while processing from 5 to 20 frames/sec. In addition, on the INRIA human (static image) benchmark database, our sub-matrix approach performs better than the full covariance case while reducing the computation cost by more than one order of magnitude.},
}

% NOTE(review): institution held the report label and submission status;
% moved status to note and kept institution as the issuing body only.
@techreport{Ba:IDIAP-RR47:2008,
title = {Multi-person visual focus of attention from head pose and meeting contextual cues},
author = {Ba, S.  and Odobez, J. -M. },
year = {2008},
type = {IDIAP-RR},
number = {47},
institution = {IDIAP},
note = {Submitted to the IEEE Transactions on Pattern Analysis and Machine Intelligence, second revision},
keywords = {IM2.VP, Report_VIII}
}

% NOTE(review): classic BibTeX requires the crossref target
% (Parthasarathi_Idiap-RR-12-2009) to appear AFTER this entry in the file.
@inproceedings{Parthasarathi_INTERSPEECH_2009,
title = {Investigating Privacy-Sensitive Features for Speech Detection in Multiparty Conversations},
author = {Parthasarathi, S. H. K.  and Magimai-Doss, M.  and Bourlard, H.  and Gatica-Perez, D. },
crossref = {Parthasarathi_Idiap-RR-12-2009},
booktitle = {Proceedings of Interspeech 2009},
year = {2009},
keywords = {IM2.AP, IM2.MCA, Report_VIII},
abstract = {We investigate four different privacy-sensitive features, namely energy, zero crossing rate, spectral flatness, and kurtosis, for speech detection in multiparty conversations. We liken this scenario to a meeting room and define our datasets and annotations accordingly. The temporal context of these features is modeled. With no temporal context, energy is the best performing single feature. But by modeling temporal context, kurtosis emerges as the most effective feature. Also, we combine the features. Besides yielding a gain in performance, certain combinations of features also reveal that a shorter temporal context is sufficient. We then benchmark other privacy-sensitive features utilized in previous studies. Our experiments show that the performance of all the privacy-sensitive features modeled with context is close to that of state-of-the-art spectral-based features, without extracting and using any features that can be used to reconstruct the speech signal.},
projects = {Idiap,
IM2,
SNSF-MULTI,
AMIDA},
}

% NOTE(review): raw "ü" replaced by the BibTeX special character {\"u} --
% this file otherwise uses classic 8-bit BibTeX escapes (e.g. Ev\'equoz),
% and raw Unicode garbles sorting/labels under classic BibTeX.
@inproceedings{Lalanne2007,
title = {Going through digital versus physical augmented gaming},
author = {Lalanne, D.  and Ev\'equoz, F.  and Chiquet, H.  and M{\"u}ller, M.  and Radgohar, M.  and Ingold, R. },
booktitle = {Tangible Play: Research and Design for Tangible and Tabletop Games. Workshop at the 2007 Intelligent User Interfaces Conference (IUI'07)},
year = {2007},
pages = {41--44},
keywords = {Report_VI, IM2.HMI}
}

% NOTE(review): @inproceedings requires a booktitle, which was missing. The
% DOI (10.1109/icme.2008.4607393), ISBN and pages indicate IEEE ICME 2008 --
% verify the added booktitle against the paper. The stale note "Submitted for
% publication" was dropped as it contradicts the DOI/pages; the Idiap report
% type/number are kept as provenance annotations (ignored by standard styles).
@inproceedings{li:rr07-71,
title = {Effective post-processing for single-channel frequency-domain speech enhancement},
author = {Li, W. },
booktitle = {Proceedings of the 2008 IEEE International Conference on Multimedia and Expo (ICME)},
year = {2008},
type = {Idiap-RR},
number = {Idiap-RR-71-2007},
pages = {149--152},
institution = {IDIAP},
isbn = {978-1-4244-2570-9},
doi = {10.1109/icme.2008.4607393},
keywords = {IM2.AP, Report_VII},
abstract = {Conventional frequency-domain speech enhancement filters improve signal-to-noise ratio (SNR), but also produce speech distortions. This paper describes a novel post-processing algorithm devised for the improvement of the quality of the speech processed by a conventional filter. In the proposed algorithm, the speech distortion is first compensated by adding the original noisy speech, and then the noise is reduced by a post-filter. Experimental results on speech quality show the effectiveness of the proposed algorithm in lower speech distortions. Based on our isolated word recognition experiments conducted in 15 real car environments, a relative word error rate (WER) reduction of 10.5\% is obtained compared to the conventional filter.}
}

% NOTE(review): first author "Bourlard, H." is inconsistent with both the
% entry key and the crossref (bourdaud:rr08-28); corrected to Bourdaud, N. --
% verify against the published paper. Also protected the EEG acronym in the
% title and escaped the literal & in the journal name.
@article{bourdaud:ieee-tnsre:2008,
title = {Characterizing the {EEG} correlates of exploratory behavior},
author = {Bourdaud, N.  and Chavarriaga, R.  and Galán, F.  and Millán, J. del R. },
crossref = {bourdaud:rr08-28},
journal = {IEEE Transactions on Neural Systems \& Rehabilitation Engineering},
year = {2008},
note = {IDIAP-RR 08-28},
keywords = {IM2.BMI, Report_VII},
abstract = {This study aims to characterize the EEG correlates of exploratory behavior. Decision making in an uncertain environment raises a conflict between two opposing needs: gathering information about the environment and exploiting this knowledge in order to optimize the decision. Exploratory behavior has already been studied using fMRI. Based on a usual paradigm in reinforcement learning, this study has shown bilateral activation in the frontal and parietal cortex. To our knowledge, no previous study has been done on it using EEG. The study of the exploratory behavior using EEG signals raises two difficulties. First, the labels of trial as exploitation or exploration cannot be directly derived from the subject action. In order to access this information, a model of how the subject makes his decision must be built. The exploration related information can be then derived from it. Second, because of the complexity of the task, its EEG correlates are not necessarily time locked with the action. So the EEG processing methods used should be designed in order to handle signals that shift in time across trials. Using the same experimental protocol as the fMRI study, results show that the bilateral frontal and parietal areas are also the most discriminant. This strongly suggests that the EEG signal also conveys information about the exploratory behavior.}
}

% NOTE(review): "In press" correctly lives in note (classic BibTeX); under
% biblatex this would be pubstate = {inpress}. Fields otherwise consistent.
@article{Garipelli_IEEETRANS.BIOMED.ENGG._2008,
title = {Fast recognition of anticipation related potentials},
author = {Garipelli, G.  and Chavarriaga, R.  and Millán, J. del R. },
journal = {IEEE Transactions on Biomedical Engineering},
year = {2008},
note = {In press},
keywords = {IM2.BMI, Report_VII},
abstract = {Anticipation increases the efficiency of daily tasks by partial advance activation of neural substrates involved in it. Here we develop a method for the recognition of electroencephalogram (EEG) correlates of this activation as early as possible on single trials which is essential for Brain-Computer Interaction (BCI). We explore various features from the EEG recorded in a Contingent Negative Variation (CNV) paradigm. We also develop a novel technique called Time Aggregation of Classification (TAC) for fast and reliable decisions that combines the posterior probabilities of several classifiers trained with features computed from temporal blocks of EEG until a certainty threshold is reached. Experiments with 9 naive subjects performing the CNV experiment with GO and NOGO conditions with an inter-stimulus interval of 4 s show that the performance of the TAC method is above 70\% for four subjects, around 60\% for two other subjects, and random for the remaining subjects. On average over all subjects, more than 50\% of the correct decisions are made at 2 s, without needing to wait until 4 s.}
}

% NOTE(review): ipdmembership/ipdxref/pdf/postscript are non-standard fields;
% standard styles silently ignore them, so they are harmless annotations.
@inproceedings{grandvalet:ICML-2:2007,
title = {Sparse probabilistic classifiers},
author = {H\'erault, R.  and Grandvalet, Y. },
booktitle = {International Conference on Machine Learning (ICML)},
year = {2007},
note = {IDIAP-RR 07-19},
keywords = {Report_VI, IM2.MPR},
abstract = {The scores returned by support vector machines are often used as a confidence measures in the classification of new examples. However, there is no theoretical argument sustaining this practice. Thus, when classification uncertainty has to be assessed, it is safer to resort to classifiers estimating conditional probabilities of class labels. Here, we focus on the ambiguity in the vicinity of the boundary decision. We propose an adaptation of maximum likelihood estimation, instantiated on logistic regression. The model outputs proper conditional probabilities into a user-defined interval and is less precise elsewhere. The model is also sparse, in the sense that few examples contribute to the solution. The computational efficiency is thus improved compared to logistic regression. Furthermore, preliminary experiments show improvements over standard logistic regression and performances similar to support vector machines.},
ipdmembership = {learning},
ipdxref = {techreport:grandvalet-idiap-rr-07-19.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2007/grandvalet-ICML-2-2007.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2007/grandvalet-ICML-2-2007.ps.gz}
}

% NOTE(review): escaped the literal & in the booktitle -- an unescaped &
% is a LaTeX alignment character and breaks typesetting of the bibliography.
@inproceedings{Koval:SPIE2007:EEA,
title = {Error exponent analysis of person identification based on fusion of dependent/independent modalities},
author = {Koval, O.  and Voloshynovskiy, S.  and Pun, T. },
booktitle = {Proceedings of SPIE-IS\&T Electronic Imaging 2007, Security, Steganography, and Watermarking of Multimedia Contents IX},
year = {2007},
keywords = {Report_VI, IM2.MPR},
vgclass = {refpap},
vgproject = {watermarking}
}

% NOTE(review): minimal but valid @article stub; volume/pages not yet known
% at entry time -- complete when available.
@article{eth_biwi_00547,
title = {Object detection by global contour shape},
author = {Schindler, K.  and Suter, D. },
journal = {Pattern Recognition},
year = {2008},
keywords = {Report_VII, IM2.VP.MCA, joint}
}

% NOTE(review): volume was {8}; Speech Communication in 2006 with pages
% 1200--1213 corresponds to volume 48 -- corrected, but verify against the
% published issue.
@article{benzeghiba:speechcom:2006,
title = {User-customized password speaker verification using multiple reference and background models},
author = {BenZeghiba, M. F.  and Bourlard, H. },
journal = {Speech Communication},
year = {2006},
volume = {48},
pages = {1200--1213},
note = {IDIAP-RR 04-41},
keywords = {Report_VI, IM2.AP},
abstract = {This paper discusses and optimizes an HMM/GMM based User-Customized Password Speaker Verification (UCP-SV) system. Unlike text-dependent speaker verification, in UCP-SV systems, customers can choose their own passwords with no lexical constraints. The password has to be pronounced a few times during the enrollment step to create a customer dependent model. Although potentially more user-friendly'', such systems are less understood and actually exhibit several practical issues, including automatic HMM inference, speaker adaptation, and efficient likelihood normalization. In our case, HMM inference (HMM topology) is performed using hybrid HMM/MLP systems, while the parameters of the inferred model, as well as their adaptation, will use GMMs. However, the evaluation of a UCP-SV baseline system shows that the background model used for likelihood normalization is the main difficulty. Therefore, to circumvent this problem, the main contribution of the paper is to investigate the use of multiple reference models for customer acoustic modeling and multiple background models for likelihood normalization. In this framework, several scoring techniques are investigated, such as Dynamic Model Selection (DMS) and fusion techniques. Results on two different experimental protocols show that an appropriate selection criteria for customer and background models can improve significantly the UCP-SV performance, making the UCP-SV system quite competitive with a text-dependent SV system. Finally, as customers' passwords are short, a comparative experiment using the conventional GMM-UBM text-independent approach is also conducted.},
ipdmembership = {speech},
ipdxref = {techreport:benzeghiba-idiap-rr-04-41.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2006/benzeghiba-speechcom-2006.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2006/benzeghiba-speechcom-2006.ps.gz}
}

% NOTE(review): series text was garbled ("45th International Conference of
% the for Computation") and publisher duplicated the venue acronym; restored
% the standard venue/publisher names -- verify against the proceedings.
@inproceedings{Armstrong-9-ISSCO,
title = {Generating usable formats for metadata and annotations in a large meeting corpus},
author = {Popescu-Belis, A.  and Estrella, P. },
booktitle = {ACL 2007},
series = {45th Annual Meeting of the Association for Computational Linguistics},
year = {2007},
pages = {93--96},
publisher = {Association for Computational Linguistics},
location = {Prague, Czech Republic},
keywords = {Report_VI, IM2.DMA, major, Interactive Poster and Demonstration Sessions}
}

% NOTE(review): escaped the literal & in the booktitle (LaTeX special
% character); everything else unchanged.
@inproceedings{Billard1,
title = {Wearcam: a head mounted wireless camera for monitoring gaze attention and for the diagnosis of developmental disorders in young children},
author = {Piccardi, L.  and Noris, B.  and Barbey, O.  and Schiavone, G.  and Keller, F.  and Von Hofsten, C.  and Billard, A. },
booktitle = {16th IEEE International Symposium on Robot \& Human Interactive Communication, RO-MAN},
series = {Special Session: Applications of Robotics and Intelligent System},
year = {2007},
keywords = {Report_VI, IM2.MPR},
affiliation = {EPFL}
}

% NOTE(review): the URL contained an embedded space ("ICA SSP06"), which
% makes the link dead; removed the space.
@inproceedings{Vila:ICASSP2006:CPUCA,
title = {Costa problem under channel ambiguity},
author = {Vila-Forc\'en, J. E.  and Voloshynovskiy, S.  and Koval, O.  and Pun, T. },
booktitle = {Proceedings of 2006 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
year = {2006},
url = {http://vision.unige.ch/publications/postscript/2006/VilaVoloshynovskiyKovalPun_ICASSP06_CPUCA.pdf},
keywords = {Report_VI, IM2.MPR},
vgclass = {refpap},
vgproject = {watermarking}
}

% NOTE(review): the URL contained an embedded space ("JAS Pcompression"),
% which makes the link dead; removed the space.
@article{Vila:2004:EURASIP,
title = {Facial Image Compression Based on Structured Codebooks in Overcomplete Domain},
author = {Vila-Forc\'en, J. E.  and Voloshynovskiy, S.  and Koval, O.  and Pun, T. },
journal = {EURASIP Journal on Applied Signal Processing, Frames and overcomplete representations in signal processing, communications, and information theory special issue},
year = {2006},
volume = {2006},
number = {Article ID 69042},
pages = {1--11},
url = {http://vision.unige.ch/publications/postscript/2005/VilaKovalVoloshynovskiyPun_JASPcompression_final.pdf},
keywords = {Report_VI, IM2.MPR},
vgclass = {refpap},
vgproject = {watermarking}
}

% NOTE(review): "owner" is a reference-manager bookkeeping field, ignored by
% standard styles. Booktitle could carry the ASRU acronym -- verify source.
@inproceedings{Levit2007,
title = {Integrating several annotation layers for statistical information distillation},
author = {Levit, M.  and Hakkani-Tur, D.  and Tur, G.  and Gillick, D. },
booktitle = {Workshop on Automatic Speech Recognition and Understanding},
year = {2007},
keywords = {Report_VII, IM2.AP},
owner = {dines}
}

% NOTE(review): the quotation marks inside the abstract look export-garbled
% (mismatched "..." pairs) -- verify against the original abstract before
% normalizing; left byte-identical here.
@inproceedings{farrahi:iswc:2008,
title = {Discovering human routines from cell phone data with topic models},
author = {Gatica-Perez, D.  and Farrahi, K. },
crossref = {farrahi:rr08-32},
booktitle = {IEEE International Symposium on Wearable Computers (ISWC)},
year = {2008},
location = {Pittsburgh, Pennsylvania},
note = {IDIAP-RR 08-32},
keywords = {IM2.MCA, Report_VII},
abstract = {We present a framework to automatically discover people's routines from information extracted by cell phones. The framework is built from a probabilistic topic model learned on novel bag type representations of activity-related cues (location, proximity and their temporal variations over a day) of peoples' daily routines. Using real-life data from the Reality Mining dataset, covering 68 000 hours of human activities, we can successfully discover location-driven (from cell tower connections) and proximity-driven (from Bluetooth information) routines in an unsupervised manner. The resulting topics meaningfully characterize some of the underlying co-occurrence structure of the activities in the dataset, including going to work early/late", being home all day", working constantly", working sporadically" and meeting at lunch time".}
}

% NOTE(review): removed the spurious space before "?" in the title (French
% typographic spacing leaked into an English title).
@incollection{Bourlard_SPRINGERMA_2008,
title = {How does a dictation machine recognize speech?},
author = {Dutoit, T.  and Couvreur, L.  and Bourlard, H. },
crossref = {Bourlard_Idiap-RR-72-2008},
booktitle = {Applied Signal Processing--A MATLAB approach},
year = {2008},
pages = {104--148},
chapter = {4},
publisher = {Springer MA},
keywords = {IM2.AP, Report_VIII},
projects = {Idiap},
}

% NOTE(review): dropped the literal "in " prefix from booktitle -- the
% bibliography style supplies "In" itself, so it would render twice.
@inproceedings{McCool_ICB2009_2009,
title = {Parts-Based Face Verification using Local Frequency Bands},
author = {McCool, C.  and Marcel, S. },
crossref = {McCool_Idiap-RR-03-2009},
booktitle = {Proceedings of IEEE/IAPR International Conference on Biometrics},
year = {2009},
keywords = {IM2.VP, Report_VIII},
projects = {Idiap},
}

% NOTE(review): consistent IDIAP-RR techreport entry; non-standard fields
% (ipdmembership/pdf/postscript) are ignored by standard styles.
@techreport{cheng:rr06-62,
title = {A generalized dynamic composition algorithm of weighted finite state transducers for large vocabulary speech recognition},
author = {Cheng, O.  and Dines, J.  and Magimai-Doss, M. },
year = {2006},
type = {IDIAP-RR},
number = {62},
institution = {IDIAP},
note = {Submitted for publication},
keywords = {Report_VI, IM2.AP},
abstract = {We propose a generalized dynamic composition algorithm of weighted finite state transducers (WFST), which avoids the creation of non-coaccessible paths, performs weight look-ahead and does not impose any constraints to the topology of the WFSTs. Experimental results on Wall Street Journal (WSJ1) 20k-word trigram task show that at 17\% WER (moderately-wide beam width), the decoding time of the proposed approach is about 48\% and 65\% of the other two dynamic composition approaches. In comparison with static composition, at the same level of 17\% WER, we observe a reduction of about 60\% in memory requirement, with an increase of about 60\% in decoding time due to extra overheads for dynamic composition.},
ipdmembership = {speech},
pdf = {ftp://ftp.idiap.ch/pub/reports/2006/cheng-idiap-rr-06-62.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2006/cheng-idiap-rr-06-62.ps.gz}
}

% NOTE(review): consistent IDIAP-RR techreport entry; abstract preserved
% verbatim (including its original quoting style).
@techreport{lathoud-rr-06-09,
title = {Unsupervised spectral subtraction for noise-robust asr on unknown transmission channels},
author = {Lathoud, G.  and Magimai-Doss, M.  and Bourlard, H. },
year = {2006},
type = {IDIAP-RR},
number = {09},
institution = {IDIAP},
keywords = {Report_VI, IM2.AP},
abstract = {This paper addresses several issues of classical spectral subtraction methods with respect to the automatic speech recognition task in noisy environments. The main contributions of this paper are twofold. First, a channel normalization method is proposed to extend spectral subtraction to the case of transmission channels such as cellphones. It equalizes the transmission channel and removes part of the additive noise. Second, a simple, computationally efficient mbox2-component probabilistic model is proposed to discriminate between speech and additive noise at the magnitude spectrogram level. Based on this model, an alternative to classical spectral subtraction is proposed, called Unsupervised Spectral Subtraction'' (USS). The main difference is that the proposed approach does not require any parameter tuning. Experimental studies on Aurora 2 show that channel normalization followed by USS compares advantageously to both classical spectral subtraction, and the ETSI standard front-end (Wiener filtering). Compared to the ETSI standard front-end, a 21.3\% relative improvement is obtained on 0 to 20 dB noise conditions, for an absolute loss of 0.1 \% in clean conditions. The computational cost of the proposed approach is very low, which makes it fit for real-time applications.},
ipdmembership = {speech lathoud mathew bourlard},
language = {English},
pdf = {ftp://ftp.idiap.ch/pub/reports/2006/rr-06-09.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2006/rr-06-09.ps.gz}
}

% NOTE(review): MLMI 2007 post-proceedings is an edited volume, so the names
% belong in editor rather than author; the URL in note moved to url; the
% roman numeral IV is brace-protected against style recasing -- verify the
% editor role against the published volume.
@book{Popescu-Belis_SPRINGER_2008,
title = {Machine Learning for Multimodal Interaction {IV}},
editor = {Popescu-Belis, A.  and Bourlard, H.  and Renals, S. },
booktitle = {Machine Learning for Multimodal Interaction (4th International Workshop, MLMI 2007, Brno, Czech Republic, June 28-30, 2007, Revised Selected Papers)},
series = {LNCS},
year = {2008},
volume = {4892},
publisher = {Springer-Verlag},
isbn = {978-3-540-78154-7},
url = {http://www.springeronline.com/978-3-540-78154-7},
keywords = {IM2.MPR, Report_VII},
abstract = {This book constitutes the thoroughly refereed post-proceedings of the 4th International Workshop on Machine Learning for Multimodal Interaction, MLMI 2007, held in Brno, Czech Republic, in June 2007. The 25 revised full papers presented together with 1 invited paper were carefully selected during two rounds of reviewing and revision from 60 workshop presentations. The papers are organized in topical sections on multimodal processing, HCI, user studies and applications, image and video processing, discourse and dialogue processing, speech and audio processing, as well as the PASCAL speech separation challenge.}
}

% NOTE(review): "called Projectron  ," in the abstract looks like a lost
% "++" (Projectron++) from an export -- verify against the paper before
% editing; left byte-identical here.
@inproceedings{orabona:ICML08:2008,
title = {The Projectron: a Bounded Kernel-Based Perceptron},
author = {Orabona, F.  and Keshet, J.  and Caputo, B. },
crossref = {orabona:rr08-30},
booktitle = {Int. Conf. on Machine Learning},
year = {2008},
note = {IDIAP-RR 08-30},
keywords = {IM2.MPR, Report_VIII},
abstract = {We present a discriminative online algorithm with a bounded memory growth, which is based on the kernel-based Perceptron. Generally, the required memory of the kernel-based Perceptron for storing the online hypothesis is not bounded. Previous work has been focused on discarding part of the instances in order to keep the memory bounded. In the proposed algorithm the instances are not discarded, but projected onto the space spanned by the previous online hypothesis. We derive a relative mistake bound and compare our algorithm both analytically and empirically to the state-of-the-art Forgetron algorithm (Dekel et al, 2007). The first variant of our algorithm, called Projectron, outperforms the Forgetron. The second variant, called Projectron  , outperforms even the Perceptron.},
ipdmembership = {vision},
projects = {Idiap},
pdf = {ftp://ftp.idiap.ch/pub/papers/2008/orabona-ICML08-2008.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2008/orabona-ICML08-2008.ps.gz}
}

% NOTE(review): consistent IDIAP-RR techreport entry; non-standard fields
% are ignored by standard styles.
@techreport{lathoud-rr-06-74,
title = {Observations on multi-band asynchrony in distant speech recordings},
author = {Lathoud, G. },
year = {2006},
type = {IDIAP-RR},
number = {74},
institution = {IDIAP},
keywords = {Report_VI, IM2.AP},
abstract = {Whenever the speech signal is captured by a microphone distant from the user, the acoustic response of the room introduces significant distortions. To remove these distortions from the signal, solutions exist that greatly improve the ASR performance (what was said?), such as dereverberation or beamforming. It may seem natural to apply those signal-level methods in the context of speaker clustering (who spoke when?) with distant microphones, for example when annotating a meeting recording for enhanced browsing experience. Unfortunately, on a corpus of real meeting recordings, it appeared that neither dereverberation nor beamforming gave any improvement on the speaker clustering task. The present technical report constitutes a first attempt to explain this failure, through a cross-correlation analysis between close-talking and distant microphone signals. The various frequency bands of the speech spectrum appear to become desynchronized when the speaker is 1 or 2 meters away from the microphone. Further directions of research are suggested to model this desynchronization.},
ipdmembership = {speech lathoud},
language = {English},
pdf = {ftp://ftp.idiap.ch/pub/reports/2006/rr-06-74.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2006/rr-06-74.ps.gz}
}

% NOTE(review): month now uses the predefined BibTeX macro (unbraced apr)
% instead of the literal {April}, so styles can localize/abbreviate it.
@inproceedings{Vijayasenan_ICASSP-2_2009,
title = {Mutual Information based Channel Selection for Speaker Diarization of Meetings Data},
author = {Vijayasenan, D.  and Valente, F.  and Bourlard, H. },
booktitle = {Proceedings of International conference on acoustics speech and signal processing},
year = {2009},
month = apr,
keywords = {IM2.AP, Report_VIII},
abstract = {This paper aims at investigating the use of Kullback-Leibler (KL) divergence based realignment with application to speaker diarization. The use of KL divergence based realignment operates directly on the speaker posterior distribution estimates and is compared with traditional realignment performed using HMM/GMM system. We hypothesize that using posterior estimates to re-align speaker boundaries is more robust than gaussian mixture models in case of multiple feature streams with different statistical properties. Experiments are run on the NIST RT06 data. These experiments reveal that in case of conventional MFCC features the two approaches yields the same performance while the KL based system outperforms the HMM/GMM re-alignment in case of combination of multiple feature streams (MFCC and TDOA).},
projects = {Idiap,
AMIDA,
IM2}
}

% NOTE(review): type IDIAP-COM (communication) is deliberate here, matching
% the pdf/postscript filenames -- not a typo for IDIAP-RR.
@techreport{lovitt:rr07-03,
title = {Correcting confusion matrices for phone recognizers},
author = {Lovitt, A. },
year = {2007},
type = {IDIAP-COM},
number = {03},
institution = {IDIAP},
keywords = {Report_VI, IM2.AP},
abstract = {Modern speech recognition has many ways of quantifying the misrecognitions a speech recognizer makes. The errors in modern speech recognition makes extensive use of the Levenshtein algorithm to find the distance between the labeled target and the recognized hypothesis. This algorithm has problems when properly aligning substitution confusions due to the lack of knowledge about the system. This work addresses a shortcoming of the alignment provided by speech recognition analysis systems (HTK specifically) and provides a more applicable algorithm for aligning the hypothesis with the target. This new procedure takes into account the systematic errors the recognizer will make and uses that knowledge to produce correct alignments.},
ipdmembership = {speech},
pdf = {ftp://ftp.idiap.ch/pub/reports/2007/lovitt-idiap-com-07-03.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2007/lovitt-idiap-com-07-03.ps.gz}
}

% NOTE(review): "owner" is reference-manager bookkeeping, ignored by
% standard styles; fields otherwise consistent.
@inproceedings{Vijayasenan2008,
title = {Combination of agglomerative and sequential clustering for speaker diarization},
author = {Vijayasenan, D.  and Valente, F.  and Bourlard, H. },
booktitle = {International Conference on Acoustics, Speech and Signal Processing},
year = {2008},
keywords = {Report_VII, IM2.AP},
owner = {dines}
}

% NOTE(review): month now uses the predefined BibTeX macro (unbraced sep)
% instead of the literal {September}, so styles can localize/abbreviate it.
@inproceedings{Motlicek_INTERSPEECH2009-3_2009,
title = {Arithmetic Coding of Sub-Band Residuals in FDLP Speech/Audio Codec},
author = {Motlicek, P.  and Ganapathy, S.  and Hermansky, H. },
booktitle = {10th Annual Conference of the International Speech Communication Association},
year = {2009},
month = sep,
pages = {2591--2594},
publisher = {ISCA 2009},
organization = {ISCA},
location = {Brighton, England},
keywords = {Arithmetic Coding, Audio Coding, Entropy Coding, Frequency Domain Linear Prediction (FDLP), Huffman Coding,
IM2.AP, Report_VIII},
abstract = {A speech/audio codec based on Frequency Domain Linear Prediction (FDLP) exploits auto-regressive modeling to approximate instantaneous energy in critical frequency sub-bands of relatively long input segments. The current version of the FDLP codec operating at 66 kbps has been shown to provide comparable subjective listening quality results to state-of-the-art codecs on similar bit-rates even without employing standard blocks such as entropy coding or simultaneous masking. This paper describes an experimental work to increase compression efficiency of the FDLP codec by employing entropy coding. Unlike conventional Huffman coding employed in current speech/audio coding systems, we describe an efficient way to exploit arithmetic coding to entropy compress quantized spectral magnitudes of the sub-band FDLP residuals. Such an approach provides 11\% ( 3 kbps) bit-rate reduction compared to the Huffman coding algorithm ( 1 kbps).},
projects = {Idiap},
}

% NOTE(review): fields consistent; "peer" is a local annotation field,
% ignored by standard styles.
@inproceedings{bertolami08shape,
title = {Shape code based lexicon reduction for offline handwriting recognition},
author = {Bertolami, R.  and Gutmann, C.  and Spitz, L.  and Bunke, H. },
booktitle = {Proc. 8th IAPR Int. Workshop on Document Analysis Systems},
year = {2008},
pages = {158--163},
keywords = {IM2.VP, Report_VIII},
peer = {yes}
}

% NOTE(review): the key's "20087" looks like a typo for 2008, but renaming a
% citation key would break existing \cite commands -- left unchanged.
@inproceedings{MMSPL-CONF-20087-008,
title = {A multi-channel objective model for the full-reference assessment of color pictures},
author = {De Simone, F.  and Ansorge, M.  and Ebrahimi, T. },
booktitle = {2nd K-space Jamboree Workshop},
year = {2008},
location = {Paris},
url = {http://infoscience.epfl.ch/getfile.py?recid=125934&mode=best},
keywords = {IM2.MCA, Report_VIII},
abstract = {This paper presents a new approach for the design of a full reference objective quality metric for the assessment of color pictures. Our goal is to build a multi-channel metric based on the perceptual weighting of single-channel metrics. A psycho-visual experiment is thus designed in order to determine the values of the weighting factors. This metric is expected to provide a new useful tool for the quality assessment of compressed pictures in the framework of codec performance evaluation.},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/record/125934},
oai-id = {oai:infoscience.epfl.ch:125934},
oai-set = {conf; fulltext; fulltext-public},
unit = {MMSPL}
}

% NOTE(review): minimal but valid @inproceedings entry; pages/publisher not
% recorded -- complete when available.
@inproceedings{AMR2007Kludas,
title = {Information fusion in multimedia information retrieval},
author = {Kludas, J.  and Bruno, E.  and Marchand-Maillet, S. },
booktitle = {Workshop on Adaptive Multimedia Retrieval (AMR 2007)},
year = {2007},
keywords = {Report_VI, IM2.MCA}
}

% NOTE(review): removed bogus "doi = {na}" -- a non-DOI value breaks DOI
% hyperlinking in url/doi-aware styles; the details/url fields remain.
@article{Bresson2005_1439/LTS,
title = {Fast Global Minimization of the Active Contour/Snake Model},
author = {Bresson, X.  and Esedoglu, S.  and Vandergheynst, P.  and Thiran, J. -Ph.  and Osher, S. },
journal = {Journal of Mathematical Imaging and Vision},
year = {2007},
volume = {28},
number = {2},
pages = {151--167},
url = {http://infoscience.epfl.ch/getfile.py?recid=87328&mode=best},
keywords = {Report_VII, IM2.VP, LTS2; LTS5},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/record/87328},
oai-id = {oai:infoscience.epfl.ch:87328},
oai-set = {article; fulltext},
status = {PUBLISHED},
unit = {LTS}
}

@inproceedings{apb-mteval-lrec2008,
title = {Improving contextual quality models for {MT} evaluation based on evaluators' feedback},
author = {Estrella, P.  and Popescu-Belis, A.  and King, M. },
booktitle = {LREC 2008 (6th International Conference on Language Resources and Evaluation)},
year = {2008},
keywords = {Report_VII, IM2.DMA},
abstract = {The Framework for Machine Translation Evaluation (FEMTI), introduced by the ISLE Evaluation Working Group, contains guidelines for defining a quality model used to evaluate an MT system, in relation to the purpose and context of use of the system. In this paper, we report results from a recent experiment aimed at transferring knowledge from MT evaluation experts into the FEMTI guidelines, in particular, to populate relations denoting the influence of the context of use of a system on its evaluation. The results of this hands-on exercise carried out as part of a tutorial, are publicly available at http://www.issco.unige.ch/femti/.}
}

@inproceedings{eth_biwi_00452,
title = {Segmentation based multi-cue integration for object detection},
author = {Leibe, B.  and Mikolajczyk, K.  and Schiele, B. },
booktitle = {British Machine Vision Conference (BMVC)},
year = {2006},
keywords = {Report_VI, IM2.VP}
}

@inproceedings{eth_biwi_00451,
title = {Efficient clustering and matching for object class recognition},
author = {Leibe, B.  and Mikolajczyk, K.  and Schiele, B. },
booktitle = {British Machine Vision Conference (BMVC)},
year = {2006},
keywords = {Report_VI, IM2.VP}
}

@inproceedings{eth_biwi_00457,
title = {Privacy in video surveilled areas},
author = {Spindler, T.  and Wartmann, C.  and Roth, D.  and Steffen, A.  and Hovestadt, L.  and van Gool, L. },
booktitle = {International Conference on Privacy, Security and Trust (PST 2006)},
year = {2006},
keywords = {Report_VI, IM2.VP, major publication, Best Paper Awards, Surveillance, Cryptography, Computer Vision, Tracking, Building Automation}
}

@inproceedings{apb-bet4tqb-lrec2008,
title = {Task-based evaluation of meeting browsers: from {BET} task elicitation to user behavior analysis},
author = {Popescu-Belis, A.  and Flynn, M.  and Wellner, P.  and Baudrion, P. },
booktitle = {LREC 2008 (6th International Conference on Language Resources and Evaluation)},
year = {2008},
keywords = {Report_VII, IM2.HMI},
abstract = {This paper presents recent results of the application of the task-based Browser Evaluation Test (BET) to meeting browsers, that is, interfaces to multimodal databases of meeting recordings. The tasks were defined by browser-neutral BET observers. Two groups of human subjects used the Transcript-based Query and Browsing interface (TQB), and attempted to solve as many BET tasks (pairs of true/false statements to disambiguate) as possible in a fixed amount of time. Their performance was measured in terms of precision and speed. Results indicate that the browser's annotation-based search functionality is frequently used, in particular the keyword search. A more detailed analysis of each test question for each participant confirms that despite considerable variation across strategies, the use of queries is correlated to successful performance.}
}

@inproceedings{eth_biwi_00513,
title = {Multi-activity tracking in {LLE} body pose space},
author = {Jaeggli, T.  and Koller-Meier, E.  and van Gool, L. },
booktitle = {2nd Workshop on HUMAN MOTION Understanding, Modeling, Capture and Animation, ICCV},
year = {2007},
keywords = {Report_VII, IM2.VP}
}

@article{eth_biwi_00516,
title = {A model-selection framework for multibody structure-and-motion of image sequences},
author = {Schindler, K.  and Suter, D.  and Wang, H. },
journal = {International Journal of Computer Vision},
year = {2007},
volume = {79},
number = {2},
pages = {159--177},
keywords = {Report_VII, IM2.VP}
}

@inproceedings{valente:Interspeech:2008,
title = {Integration of TDOA Features in Information Bottleneck Framework for Fast Speaker Diarization},
author = {Vijayasenan, D.  and Valente, F.  and Bourlard, H. },
crossref = {valente:rr08-26},
booktitle = {Interspeech 2008},
year = {2008},
note = {IDIAP-RR 08-26},
keywords = {IM2.AP, Report_VIII},
abstract = {In this paper we address the combination of multiple feature streams in a fast speaker diarization system for meeting recordings. Whenever Multiple Distant Microphones (MDM) are used, it is possible to estimate the Time Delay of Arrival (TDOA) for different channels. In citexavi_comb},
ipdmembership = {speech},
projects = {Idiap},
pdf = {ftp://ftp.idiap.ch/pub/papers/2008/valente-Interspeech-2008.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2008/valente-Interspeech-2008.ps.gz}
}

@inproceedings{eth_biwi_00514,
title = {Learning generative models for monocular body pose estimation},
author = {Jaeggli, T.  and Koller-Meier, E.  and van Gool, L. },
booktitle = {ACCV},
year = {2007},
keywords = {Report_VII, IM2.VP}
}

@article{eth_biwi_00515,
title = {Learning semantic object parts for object categorization},
author = {Leibe, B.  and Ettlin, A.  and Schiele, B. },
journal = {Image and Vision Computing},
year = {2008},
volume = {26},
number = {1},
pages = {15--26},
keywords = {Report_VII, IM2.VP}
}

@inproceedings{valente:Interspeech:2007,
title = {Hierarchical neural networks feature extraction for lvcsr system},
author = {Valente, F.  and Vepa, J.  and Plahl, C.  and Gollan, C.  and Hermansky, H.  and Schlüter, R. },
booktitle = {Interspeech 2007},
year = {2007},
note = {IDIAP-RR 07-08},
keywords = {Report_VI, IM2.AP},
abstract = {This paper investigates the use of a hierarchy of Neural Networks for performing data driven feature extraction. Two different hierarchical structures based on long and short temporal context are considered. Features are tested on two different LVCSR systems for Meetings data (RT05 evaluation data) and for Arabic Broadcast News (BNAT05 evaluation data). The hierarchical NNs outperforms the single NN features consistently on different type of data and tasks and provides significant improvements w.r.t. respective baselines systems. Best result is obtained when different time resolutions are used at different level of the hierarchy.},
ipdmembership = {speech},
ipdxref = {techreport:valente-idiap-rr-07-08.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2007/valente-Interspeech-2007.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2007/valente-Interspeech-2007.ps.gz}
}

@inproceedings{Dines_INTERSPEECH-2_2009,
title = {Speech recognition with speech synthesis models by marginalising over decision tree leaves},
author = {Dines, J.  and Saheer, L.  and Liang, H. },
crossref = {Dines_Idiap-RR-17-2009},
booktitle = {Proceedings of Interspeech},
year = {2009},
location = {Brighton, U.K.},
keywords = {decision trees, speech recognition, speech synthesis, unified models, IM2.AP, Report_VIII},
abstract = {There has been increasing interest in the use of unsupervised adaptation for the personalisation of text-to-speech (TTS) voices, particularly in the context of speech-to-speech translation. This requires that we are able to generate adaptation transforms from the output of an automatic speech recognition (ASR) system. An approach that utilises unified ASR and TTS models would seem to offer an ideal mechanism for the application of unsupervised adaptation to TTS since transforms could be shared between ASR and TTS. Such unified models should use a common set of parameters. A major barrier to such parameter sharing is the use of differing contexts in ASR and TTS. In this paper we propose a simple approach that generates ASR models from a trained set of TTS models by marginalising over the TTS contexts that are not used by ASR. We present preliminary results of our proposed method on a large vocabulary speech recognition task and provide insights into future directions of this work.},
projects = {EMIME},
}

@inproceedings{ferrez_graz_2008,
title = {Simultaneous real-time detection of motor imagery and error-related potentials for improved bci accuracy},
author = {Ferrez, P. W.  and Millán, J. del R. },
booktitle = {Proceedings of the 4th International Brain-Computer Interface Workshop and Training Course},
year = {2008},
keywords = {IM2.BCI, Report_VII},
abstract = {Brain-computer interfaces (BCIs), as any other interaction modality based on physiological signals and body channels (e.g., muscular activity, speech and gestures), are prone to errors in the recognition of subject's intent. An elegant approach to improve the accuracy of BCIs consists of a verification procedure directly based on the presence of error-related potentials (ErrP) in the EEG recorded right after the occurrence of an error. Two healthy volunteer subjects with little prior BCI experience participated in a real-time human-robot interaction experiment where they were asked to mentally move a cursor towards a target that can be reached within a few steps using motor imagery. These experiments confirm the previously reported presence of a new kind of ErrP. These Interaction ErrP exhibit a first sharp negative peak followed by a positive peak and a second broader negative peak ( 270, 330 and 430 ms after the feedback, respectively). The objective of the present study was to simultaneously detect erroneous responses of the interface and classifying motor imagery at the level of single trials in a real-time system. We have achieved online an average recognition rate of correct and erroneous single trials of 84.7\% and 78.8\%, respectively. The off-line post-analysis showed that the BCI error rate without the integration of ErrP detection is around 30\% for both subjects. However, when integrating ErrP detection, the average online error rate drops to 7\%, multiplying the bit rate by more than 3. These results show that it's possible to simultaneously extract in real-time useful information for mental control to operate a brain-actuated device as well as correlates of cognitive states such as error-related potentials to improve the quality of the brain-computer interaction.}
}

@inproceedings{soleymani2008:riederalp,
title = {Estimating emotions and tracking interest during movie watching based on multimedia content and physiological responses},
author = {Soleymani, M.  and Kierkels, J.  and Chanel, G.  and Bruno, E.  and Marchand-Maillet, S.  and {T. Pun} },
booktitle = {Joint (IM)2-Interactive Multimodal Information Management and Affective Sciences NCCRs meeting},
year = {2008},
keywords = {Report_VII, IM2.MCA}
}

@article{quelhas:pami:2007,
title = {A thousand words in a scene},
author = {Quelhas, P.  and Odobez, J. -M.  and Gatica-Perez, D.  and Tuytelaars, T. },
crossref = {quelhas:rr05-40},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
year = {2007},
volume = {29},
number = {9},
pages = {1575--1589},
note = {IDIAP-RR 05-40},
doi = {10.1109/tpami.2007.1155},
keywords = {IM2.VP, Report_VII},
abstract = {This paper presents a novel approach for visual scene modeling and classification, investigating the combined use of text modeling methods and local invariant features. Our work attempts to elucidate (1) whether a text-like emphbag-of-visterms representation (histogram of quantized local visual features) is suitable for scene (rather than object) classification, (2) whether some analogies between discrete scene representations and text documents exist, and (3) whether unsupervised, latent space models can be used both as feature extractors for the classification task and to discover patterns of visual co-occurrence. Using several data sets, we validate our approach, presenting and discussing experiments on each of these issues. We first show, with extensive experiments on binary and multi-class scene classification tasks using a 9500-image data set, that the emphbag-of-visterms representation consistently outperforms classical scene classification approaches. In other data sets we show that our approach competes with or outperforms other recent, more complex, methods. We also show that Probabilistic Latent Semantic Analysis (PLSA) generates a compact scene representation, discriminative for accurate classification, and more robust than the emphbag-of-visterms representation when less labeled training data is available. Finally, through aspect-based image ranking experiments, we show the ability of PLSA to automatically extract visually meaningful scene patterns, making such representation useful for browsing image collections.}
}

@inproceedings{tsamuel:mlmi:2008,
title = {Hilbert envelope based features for far-field speech recognition},
author = {Thomas, A.  and Ganapathy, S.  and Hermansky, H. },
crossref = {tsamuel:rr08-42},
booktitle = {MLMI 2008},
year = {2008},
publisher = {Utrecht, The Netherlands},
note = {IDIAP-RR 08-42},
keywords = {IM2.AP, Report_VII},
abstract = {Automatic speech recognition (ASR) systems, trained on speech signals from close-talking microphones, generally fail in recognizing far-field speech. In this paper, we present a Hilbert Envelope based feature extraction technique to alleviate the artifacts introduced by room reverberations. The proposed technique is based on modeling temporal envelopes of the speech signal in narrow sub-bands using Frequency Domain Linear Prediction (FDLP). ASR experiments on far-field speech using the proposed FDLP features show significant performance improvements when compared to other robust feature extraction techniques (average relative improvement of $43 \%$ in word error rate).}
}

@article{fleuret-geman-2008,
title = {Stationary features and cat detection},
author = {Fleuret, F.  and Geman, D. },
journal = {Journal of Machine Learning Research (JMLR)},
year = {2008},
volume = {9},
pages = {2549--2578},
keywords = {IM2.VP, Report_VIII}
}

@article{plauche07,
title = {How to build a spoken dialog system with limited (or no) resources},
author = {Plauch\'e, M.  and Cetin, O.  and Uhdaykumar, N. },
journal = {AI in ICT for Development Workshop of the Twentieth Intl. Joint Conf. on AI, Hyderabad, India},
year = {2007},
keywords = {Report_VI, IM2.AP}
}

@inproceedings{Friedland15,
title = {Two's a crowd: improving speaker diarization by automatically identifying and excluding overlapped speech},
author = {Boakye, K.  and Vinyals, O.  and Friedland, G. },
booktitle = {Interspeech 2008, Brisbane, Australia},
year = {2008},
pages = {32--35},
keywords = {IM2.AP, Report_VIII}
}

@techreport{Richiardi2006ABIDphaseIIreport,
title = {Applying biometrics to identity documents: estimating and coping with errors},
author = {Richiardi, J.  and Drygajlo, A. },
year = {2006},
type = {SNSF AMBAI project technical report},
institution = {Swiss Federal Institute of Technology},
keywords = {Report_VI, IM2.MPR},
owner = {Jori}
}

@inproceedings{Friedland14,
title = {Modulation spectrogram features for speaker diarization},
author = {Vinyals, O.  and Friedland, G. },
booktitle = {Interspeech 2008, Brisbane, Australia},
year = {2008},
pages = {630--633},
keywords = {IM2.AP, Report_VIII}
}

@inproceedings{Friedland10,
title = {A hardware-independent fast logarithm approximation with adjustable accuracy},
author = {Vinyals, O.  and Friedland, G. },
booktitle = {10th IEEE International Symposium on Multimedia, Berkeley, CA, USA},
year = {2008},
pages = {61--65},
keywords = {IM2.AP, Report_VIII}
}

@inproceedings{Friedland13,
title = {Towards audio-visual on-line diarization of participants in group meetings},
author = {Hung, H.  and Friedland, G. },
booktitle = {European Conference on Computer Vision (ECCV) 2008, Marseille, France},
year = {2008},
keywords = {IM2.AP, Report_VIII}
}

@inproceedings{Friedland12,
title = {Live speaker identification in conversations},
author = {Friedland, G.  and Vinyals, O. },
booktitle = {ACM Multimedia 2008, Vancouver, Canada},
year = {2008},
pages = {1017--1018},
keywords = {IM2.AP, Report_VIII}
}

@techreport{kumatani:rr08-07,
title = {Maximum negentropy beamforming},
author = {Kumatani, K.  and McDonough, J.  and Klakow, D.  and Garner, P. N.  and Li, W. },
year = {2008},
type = {Idiap-RR},
number = {Idiap-RR-07-2008},
institution = {IDIAP},
keywords = {Report_VII, IM2.AP},
ipdmembership = {speech},
projects = {Idiap},
pdf = {ftp://ftp.idiap.ch/pub/reports/2008/kumatani-idiap-rr-08-07.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2008/kumatani-idiap-rr-08-07.ps.gz}
}

@inproceedings{Anemueller_ICMI2008_2008,
title = {The DIRAC AWEAR Audio-Visual Platform for Detection of Unexpected and Incongruent Events},
author = {Anemuller, J.  and Back, J. -H.  and Caputo, B.  and Havlena, M.  and Luo, J.  and Kayser, H.  and Leibe, B.  and Motlicek, P.  and Pajdla, T.  and Pavel, M.  and Torii, A.  and van Gool, L.  and Zweig, A.  and Hermansky, H. },
booktitle = {Proceedings of the International Conference on Multimodal Interfaces},
year = {2008},
keywords = {IM2.DMA, Report_VIII},
abstract = {It is of prime importance in everyday human life to cope with and respond appropriately to events that are not foreseen by prior experience. Machines to a large extent lack the ability to respond appropriately to such inputs. An important class of unexpected events is defined by incongruent combinations of inputs from different modalities and therefore multimodal information provides a crucial cue for the identification of such events, e.g., the sound of a voice is being heard while the person in the fieldof- view does not move her lips. In the project DIRAC (''Detection and Identification of Rare Audio-visual Cues'') we have been developing algorithmic approaches to the detection of such events, as well as an experimental hardware platform to test it. An audio-visual platform (''AWEAR'' - audio-visual wearable device) has been constructed with the goal to help users with disabilities or a high cognitive load to deal with unexpected events. Key hardware components include stereo panoramic vision sensors and 6-channel worn-behind-the-ear (hearing aid) microphone arrays. Data have been recorded to study audio-visual tracking, a/v scene/object classification and a/v detection of incongruencies.},
projects = {Idiap,
DIRAC},
}

@techreport{kumatani:rr08-02,
title = {Filter Bank Design for Subband Adaptive Beamforming and Application to Speech Recognition},
author = {Kumatani, K.  and McDonough, J.  and Schacht, S.  and Klakow, D.  and Garner, P. N.  and Li, W. },
year = {2008},
type = {Idiap-RR},
number = {Idiap-RR-02-2008},
institution = {IDIAP},
keywords = {IM2.AP, Report_VIII},
abstract = {beginabstract We present a new filter bank design method for subband adaptive beamforming. Filter bank design for adaptive filtering poses many problems not encountered in more traditional applications such as subband coding of speech or music. The popular class of perfect reconstruction filter banks is not well-suited for applications involving adaptive filtering because perfect reconstruction is achieved through alias cancellation, which functions correctly only if the outputs of individual subbands are emphnot subject to arbitrary magnitude scaling and phase shifts. In this work, we design analysis and synthesis prototypes for modulated filter banks so as to minimize each aliasing term individually. We then show that the emphtotal response error can be driven to zero by constraining the analysis and synthesis prototypes to be emphNyquist($M$) filters. We show that the proposed filter banks are more robust for aliasing caused by adaptive beamforming than conventional methods. Furthermore, we demonstrate the effectiveness of our design technique through a set of automatic speech recognition experiments on the multi-channel, far-field speech data from the emphPASCAL Speech Separation Challenge. In our system, speech signals are first transformed into the subband domain with the proposed filter banks, and thereafter the subband components are processed with a beamforming algorithm. Following beamforming, post-filtering and binary masking are performed to further enhance the speech by removing residual noise and undesired speech. The experimental results prove that our beamforming system with the proposed filter banks achieves the best recognition performance, a 39.6\% word error rate (WER), with half the amount of computation of that of the conventional filter banks while the perfect reconstruction filter banks provided a 44.4\% WER. endabstract},
ipdmembership = {speech},
projects = {Idiap},
pdf = {ftp://ftp.idiap.ch/pub/reports/2008/kumatani-idiap-rr-08-02.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2008/kumatani-idiap-rr-08-02.ps.gz}
}

@inproceedings{Gonzalez_ECCV_2008,
title = {Automated delineation of dendritic networks in noisy image stacks},
author = {Gonzalez, G.  and Fleuret, F.  and Fua, P. },
booktitle = {The 10th European Conference on Computer Vision},
year = {2008},
location = {Marseille, France},
keywords = {IM2.VP, Report_VII}
}

@inproceedings{chavarriaga:sc:2008,
title = {Analyzing interactions between navigation strategies using a computational model of action selection},
author = {Doll\'e, L.  and Khamassi, M.  and Girard, B.  and Guillot, A.  and Chavarriaga, R. },
crossref = {chavarriaga:rr08-48},
booktitle = {Spatial Cognition 2008 (SC '08)},
series = {Lecture Notes in Computer Science},
year = {2008},
pages = {71--86},
location = {Freiburg, Germany},
note = {IDIAP-RR 08-48},
keywords = {IM2.MCA, Report_VII},
abstract = {For animals as well as for humans, the hypothesis of multiple memory systems involved in different navigation strategies is supported by several biological experiments. However, due to technical limitations, it remains difficult for experimentalists to elucidate how these neural systems interact. We present how a computational model of selection between navigation strategies can be used to analyse phenomena that cannot be directly observed in biological experiments. We reproduce an experiment where the rat's behaviour is assumed to be ruled by two different navigation strategies (a cue-guided and a map-based one). Using a modelling approach, we can explain the experimental results in terms of interactions between these systems, either competing or cooperating at specific moments of the experiment. Modelling such systems can help biological investigations to explain and predict the animal behaviour.}
}

@article{MTAP2006,
title = {Handling temporal heterogeneous data for content-based management of large video collections},
author = {Mo{\"e}nne-Loccoz, N.  and Janvier, B.  and Marchand-Maillet, S.  and Bruno, E. },
journal = {Multimedia Tools and Applications},
year = {2006},
volume = {31},
pages = {309--325},
keywords = {Report_VI, IM2.MCA}
}

@techreport{vinciarelli:rr07-33,
title = {Mapping nonverbal communication into social status: automatic recognition of journalists and non-journalists in radio news},
author = {Vinciarelli, A. },
year = {2007},
type = {IDIAP-RR},
number = {33},
institution = {IDIAP},
note = {Submitted for publication},
keywords = {Report_VI, IM2.MCA.MPR, joint publication},
abstract = {This work shows how features accounting for nonverbal speaking characteristics can be used to map people into predefined categories. In particular, the results of this paper show that the speakers participating in radio broadcast news can be classified into journalists and non-journalists with an accuracy higher than 80 percent. The results of the approach proposed for this task is compared with the effectiveness of 16 human assessors performing the same task. The assessors do not understand the language of the data and are thus forced to use mostly nonverbal features. The results of the comparison suggest that the assessors and the automatic system have a similar performance.},
ipdmembership = {vision},
pdf = {ftp://ftp.idiap.ch/pub/reports/2007/vinciarelli-idiap-rr-07-33.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2007/vinciarelli-idiap-rr-07-33.ps.gz}
}

@incollection{popescubelis-mlmi08,
title = {The amida automatic content linking device: just-in-time document retrieval in meetings},
author = {Popescu-Belis, A.  and Boertjes, E.  and Kilgour, J.  and Poller, P.  and Castronovo, S.  and Wilson, T.  and Jaimes, A.  and Carletta, J. },
editor = {Popescu-Belis, A.  and Stiefelhagen, R. },
booktitle = {Machine Learning for Multimodal Interaction V (Proceedings of MLMI 2008, Utrecht, 8-10 September 2008)},
series = {LNCS 5237},
year = {2008},
pages = {273--284},
publisher = {Springer-Verlag},
keywords = {Report_VII, IM2.HMI}
}

@inproceedings{schlapbach08automatic,
title = {Automatic estimation of the readability of handwritten text},
author = {Schlapbach, A.  and Wettstein, F.  and Bunke, H. },
booktitle = {Proc. 16th European Signal Processing Conference},
year = {2008},
keywords = {IM2.VP, Report_VIII},
peer = {yes}
}

@incollection{popescubelis-mlmi07,
title = {Towards an objective test for meeting browsers: the bet4tqb pilot experiment},
author = {Popescu-Belis, A.  and Baudrion, P.  and Flynn, M.  and Wellner, P. },
editor = {Popescu-Belis, A.  and Bourlard, H.  and Renals, S. },
booktitle = {Machine Learning for Multimodal Interaction IV},
series = {LNCS 4892},
year = {2008},
pages = {108--119},
publisher = {Springer-Verlag},
doi = {10.1007/978-3-540-78155-4_10},
keywords = {Report_VII, IM2.HMI}
}

@inproceedings{Voloshynovskiy:ACM2006,
title = {On reversibility of random binning based data-hiding techniques: security perspectives},
author = {Voloshynovskiy, S.  and Koval, O.  and Topak, E.  and Forcen, J. E. V.  and Pun, T. },
booktitle = {ACM Multimedia and Security Workshop 2006},
year = {2006},
url = {http://vision.unige.ch/publications/postscript/2006/mmsec167-topak.ps},
keywords = {Report_VI, IM2.MPR},
vgclass = {refpap},
vgproject = {watermarking}
}

@inproceedings{valente:ICASSP:2007,
title = {Combination of acoustic classifiers based on dempster-shafer theory of evidence},
author = {Valente, F.  and Hermansky, H. },
booktitle = {IEEE Int. Conf. on Acoustics, Speech, and Signal Processing (ICASSP)},
year = {2007},
note = {IDIAP-RR 06-61},
keywords = {Report_VI, IM2.AP},
abstract = {In this paper we investigate combination of neural net based classifiers using Dempster-Shafer Theory of Evidence. Under some assumptions, combination rule resembles a product of errors rule observed in human speech perception. Different combination are tested in ASR experiments both in matched and mismatched conditions and compared with more conventional probability combination rules. Proposed techniques are particularly effective in mismatched conditions.},
ipdmembership = {speech},
ipdxref = {techreport:valente-idiap-rr-06-61.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2007/valente-ICASSP-2007.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2007/valente-ICASSP-2007.ps.gz}
}

@inproceedings{Tsourakis:SIMPE:2008,
title = {From desktop to mobile: adapting a successful voice interaction platform for use in mobile devices},
author = {Tsourakis, N.  and Lisowska, A.  and Bouillon, P.  and Rayner, M. },
booktitle = {Third ACM MobileHCI Workshop on Speech in Mobile and Pervasive Environments (SiMPE)},
year = {2008},
location = {Amsterdam, the Netherlands.},
note = {2nd-5th, September},
keywords = {IM2.HMI, Report_VII}
}

@article{kludas2008:mtap,
title = {Can feature information interaction help for information fusion in multimedia problems?},
author = {Kludas, J.  and Bruno, E.  and Marchand-Maillet, S. },
journal = {Multimedia Tools and Applications},
year = {2008},
note = {To appear; special issue on ``Metadata Mining for Image Understanding''},
keywords = {Report_VII, IM2.MCA}
}

@article{schlapbach07writer,
title = {A writer identification and verification system using HMM based recognizers},
author = {Schlapbach, A.  and Bunke, H. },
journal = {Pattern Analysis and Applications},
year = {2007},
volume = {10},
number = {1},
pages = {33--43},
publisher = {Springer},
keywords = {Report_VI, IM2.VP},
peer = {yes}
}

@techreport{Ganapathy_Idiap-RR-75-2008,
title = {Low-Delay Error Resilient Speech Coding Using Sub-band Hilbert Envelopes},
author = {Ganapathy, S.  and Motlicek, P.  and Hermansky, H. },
year = {2008},
month = sep,
type = {Idiap-RR},
number = {Idiap-RR-75-2008},
institution = {Idiap},
keywords = {IM2.AP, Report_VIII},
projects = {Idiap, IM2}
}

@article{Dumas20084,
title = {Prototyping multimodal interfaces with smuiml modeling language},
author = {Dumas, B.  and Lalanne, D.  and Ingold, R. },
year = {2008},
pages = {63--66},
keywords = {Report_VII, IM2.HMI}
}

@inproceedings{bourlard:langtech:2008,
title = {Recognition and understanding of meetings overview of the european ami and amida projects},
author = {Bourlard, H.  and Renals, S. },
crossref = {bourlard:rr08-27},
booktitle = {LangTech 2008},
year = {2008},
location = {Rome},
note = {IDIAP-RR 08-27},
keywords = {IM2.AP, Report_VII},
abstract = {The AMI and AMIDA projects are concerned with the recognition and interpretation of multiparty (face-to-face and remote) meetings. Within these projects we have developed the following: (1) an infrastructure for recording meetings using multiple microphones and cameras; (2) a one hundred hour, manually annotated meeting corpus; (3) a number of techniques for indexing, and summarizing of meeting videos using automatic speech recognition and computer vision, and (4) a extensible framework for browsing, and searching of meeting videos. We give an overview of the various techniques developed in AMI (mainly involving face-to-face meetings), their integration into our meeting browser framework, and future plans for AMIDA (Augmented Multiparty Interaction with Distant Access), the follow-up project to AMI. Technical and business information related to these two projects can be found at www.amiproject.org, respectively on the Scientific and Business portals.}
}

@article{Dumas20082,
title = {Strengths and weaknesses of software architectures for the rapid creation of tangible and multimodal interfaces},
author = {Dumas, B.  and Lalanne, D.  and Guinard, D.  and Koenig, R.  and Ingold, R. },
year = {2008},
pages = {47--54},
keywords = {Report_VII, IM2.HMI}
}

@inproceedings{valente:ICASSP:2008,
title = {Hierarchical and parallel processing of modulation spectrum for asr applications},
author = {Valente, F.  and Hermansky, H. },
crossref = {valente:rr07-45},
booktitle = {IEEE Int. Conf. on Acoustics, Speech, and Signal Processing (ICASSP)},
year = {2008},
pages = {4165--4168},
isbn = {978-1-4244-1483-3},
issn = {1520-6149},
note = {IDIAP-RR 07-45},
doi = {10.1109/icassp.2008.4518572},
keywords = {IM2.AP, Report_VII},
abstract = {The modulation spectrum is an efficient representation for describing dynamic information in signals. In this work we investigate how to exploit different elements of the modulation spectrum for extraction of information in automatic recognition of speech (ASR). Parallel and hierarchical (sequential) approaches are investigated. Parallel processing combines outputs of independent classifiers applied to different modulation frequency channels. Hierarchical processing uses different modulation frequency channels sequentially. Experiments are run on a LVCSR task for meetings transcription and results are reported on the RT05 evaluation data. Processing modulation frequencies channels with different classifiers provides a consistent reduction in WER (2\% absolute w.r.t. PLP baseline). Hierarchical processing outperforms parallel processing. The largest WER reduction is obtained trough sequential processing moving from high to low modulation frequencies. This model is consistent with several perceptual and physiological studies on auditory processing.}
}

@inproceedings{farrahi:acmmm:2008,
title = {What did you do today? discovering daily routines from large-scale mobile data},
author = {Gatica-Perez, D.  and Farrahi, K. },
crossref = {farrahi:rr08-49},
booktitle = {ACM International Conference on Multimedia (ACMMM)},
year = {2008},
location = {Vancouver},
note = {IDIAP-RR 08-49},
keywords = {IM2.MCA, Report_VII},
abstract = {We present a framework built from two Hierarchical Bayesian topic models to discover human location-driven routines from mobile phones. The framework uses location-driven bag representations of people's daily activities obtained from celltower connections. Using 68 000 hours of real-life human data from the Reality Mining dataset, we successfully discover various types of routines. The first studied model, Latent Dirichlet Allocation (LDA), automatically discovers characteristic routines for all individuals in the study, including going to work at 10am", leaving work at night", or staying home for the entire evening". In contrast, the second methodology with the Author Topic model (ATM) finds routines characteristic of a selected groups of users, such as being at home in the mornings and evenings while being out in the afternoon", and ranks users by their probability of conforming to certain daily routines.}
}

@article{KokFro-SPL.07,
title = {Accelerating distributed consensus using extrapolation},
author = {Kokiopoulou, E.  and Frossard, P. },
journal = {IEEE Signal Processing Letters},
year = {2007},
volume = {14},
number = {10},
pages = {665--668},
keywords = {Report_VII, IM2.DMA.VP, joint publication}
}

@article{GaticaReview09,
title = {Automatic nonverbal analysis of social interaction in small groups: a review},
author = {Gatica-Perez, D. },
journal = {Image and Vision Computing, Special Issue on Human Naturalistic Behavior},
year = {2009},
note = {in press},
keywords = {IM2.MPR, Report_VIII}
}

@inproceedings{pinto:ICSLP:2007,
title = {Exploiting phoneme similarities in hybrid hmm-ann keyword spotting},
author = {Pinto, J. P.  and Lovitt, A.  and Hermansky, H. },
booktitle = {Proceedings of Interspeech},
year = {2007},
note = {IDIAP-RR 07-11},
keywords = {Report_VI, IM2.AP},
abstract = {We propose a technique for generating alternative models for keywords in a hybrid hidden Markov model - artificial neural network (HMM-ANN) keyword spotting paradigm. Given a base pronunciation for a keyword from the lookup dictionary, our algorithm generates a new model for a keyword which takes into account the systematic errors made by the neural network and avoiding those models that can be confused with other words in the language. The new keyword model improves the keyword detection rate while minimally increasing the number of false alarms.},
ipdmembership = {speech},
ipdxref = {techreport:pinto-idiap-rr-07-11.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2007/pinto-ICSLP-2007.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2007/pinto-ICSLP-2007.ps.gz}
}

@inproceedings{humm07:icb,
title = {Modelling combined handwriting and speech modalities},
author = {Humm, A.  and Hennebert, J.  and Ingold, R. },
booktitle = {Accepted for publication, International Conference on Biometrics (ICB 2007), Seoul Korea},
year = {2007},
keywords = {Report_VI, IM2.MPR}
}

@book{mlmi2008,
title = {Machine learning for multimodal interaction v (proceedings of mlmi 2008, utrecht, 8-10 september 2008)},
editor = {Popescu-Belis, A.  and Stiefelhagen, R. },
series = {LNCS 5237},
year = {2008},
publisher = {Springer-Verlag},
keywords = {Report_VII, IM2.DMA}
}

@book{mlmi2007,
title = {Machine learning for multimodal interaction iv (revised selected papers from mlmi 2007, brno, 28-30 june 2007)},
editor = {Popescu-Belis, A.  and Bourlard, H.  and Renals, S. },
series = {LNCS 4892},
year = {2008},
publisher = {Springer-Verlag},
keywords = {Report_VII, IM2.DMA}
}

@inproceedings{Hung_icassp08,
title = {Estimating the dominant person in multi-party conversations using speaker diarization strategies},
author = {Hung, H.  and Huang, Y.  and Friedland, G.  and Gatica-Perez, D. },
booktitle = {ICASSP 08},
year = {2008},
keywords = {Report_VII, IM2.MPR}
}

@inproceedings{quack07iccv,
title = {Efficient mining of frequent and distinctive feature configurations},
author = {Quack, T.  and Ferrari, V.  and Leibe, B.  and van Gool, L. },
booktitle = {accepted for ICCV'07},
year = {2007},
keywords = {Report_VI, IM2.ISD, IM2.MCA, joint publication}
}

@inproceedings{KokFroGko-ISIT.08,
title = {Optimal polynomial filtering for accelerating distributed consensus},
author = {Kokiopoulou, E.  and Frossard, P.  and Gkorou, D. },
booktitle = {IEEE Int. Symp. on Information Theory (ISIT)},
year = {2008},
keywords = {Report_VII, IM2.DMA.VP, joint publication}
}

@article{pop09-vincia,
title = {Multimedia meeting processing and retrieval at the idiap research institute},
author = {Popescu-Belis, A.  and Vinciarelli, A. },
journal = {Informer (Newsletter of the BCS Information Retrieval Specialist Group)},
year = {2009},
volume = {29},
pages = {14--16},
keywords = {IM2.DMA, Report_VIII}
}

@inproceedings{Popescu-Belis_MLMI_2008,
title = {The AMIDA Automatic Content Linking Device: Just-in-Time Document Retrieval in Meetings},
author = {Popescu-Belis, A.  and Boertjes, E.  and Kilgour, J.  and Poller, P.  and Castronovo, S.  and Wilson, T.  and Jaimes, A.  and Carletta, J. },
editor = {Popescu-Belis, A.  and Stiefelhagen, R. },
booktitle = {Machine Learning for Multimodal Interaction V},
series = {LNCS},
year = {2008},
volume = {5237},
pages = {272--283},
publisher = {Springer-Verlag},
location = {Utrecht},
doi = {10.1007/978-3-540-85853-9_25},
keywords = {IM2.MCA, IM2.HMI, Report_VIII},
abstract = {The AMIDA Automatic Content Linking Device (ACLD) is a just-in-time document retrieval system for meeting environments. The ACLD listens to a meeting and displays information about the documents from the group's history that are most relevant to what is being said. Participants can view an outline or the entire content of the documents, if they feel that these documents are potentially useful at that moment of the meeting. The ACLD proof-of-concept prototype places meeting-related documents and segments of previously recorded meetings in a repository and indexes them. During a meeting, the ACLD continually retrieves the documents that are most relevant to keywords found automatically using the current meeting speech. The current prototype simulates the real-time speech recognition that will be available in the near future. The software components required to achieve these functions communicate using the Hub, a client/server architecture for annotation exchange and storage in real-time. Results and feedback for the first ACLD prototype are outlined, together with plans for its future development within the AMIDA EU integrated project. Potential users of the ACLD supported the overall concept, and provided feedback to improve the user interface and to access documents beyond the group's own history.},
projects = {Idiap,
AMIDA},
}

@techreport{Tommasi_Idiap-RR-77-2008,
title = {CLEF2008 Image Annotation Task: an SVM Confidence-Based Approach},
author = {Tommasi, T.  and Orabona, F.  and Caputo, B. },
year = {2008},
type = {Idiap-RR},
number = {Idiap-RR-77-2008},
institution = {Idiap},
note = {CLEF 2008 Working Notes},
keywords = {IM2.VP, Report_VIII},
abstract = {This paper presents the algorithms and results of our participation to the medical image annotation task of ImageCLEFmed 2008. Our previous experience in the same task in 2007 suggests that combining multiple cues with different SVM-based approaches is very effective in this domain. Moreover it points out that local features are the most discriminative cues for the problem at hand. On these basis we decided to integrate two different local structural and textural descriptors. Cues are combined through simple concatenation of the feature vectors and through the Multi-Cue Kernel. The trickiest part of the challenge this year was annotating images coming mainly from classes with only few examples in the training set. We tackled the problem on two fronts: (1) we introduced a further integration strategy using SVM as an opinion maker. It consists in combining the first two opinions on the basis of a technique to evaluate the confidence of the classifier's decisions. This approach produces class labels with ''don't know'' wildcards opportunely placed; (2) we enriched the poorly populated training classes adding virtual examples generated slightly modifying the original images. We submitted several runs considering different combination of the proposed techniques. Our team was called ''idiap''. The run using jointly the low cue-integration technique, the confidence-based opinion fusion and the virtual examples, scored 74.92 ranking first among all submissions.},
projects = {Idiap,
EMMA},
}

@inproceedings{Mueller2006,
title = {Tjass, a smart board for augmenting card game playing and learning (demo)},
author = {M{\"u}ller, M.  and Ev{\'e}quoz, F.  and Lalanne, D. },
booktitle = {Symposium on User Interface Software and Technology (UIST 2006)},
year = {2006},
pages = {67--68},
keywords = {Report_VI, IM2.HMI}
}

@techreport{psompura:rr07-27,
title = {Analysis of confusion matrix to combine evidence for phoneme recognition},
author = {Prasanna, S. R. Mahadeva and Yegnanarayana, B.  and Pinto, J. P.  and Hermansky, H. },
year = {2007},
type = {Idiap-RR-27-2007},
number = {27},
institution = {IDIAP},
note = {Submitted for publication},
keywords = {IM2.AP, Report_VII},
abstract = {In this work we analyze and combine evidences from different classifiers for phoneme recognition using information from the confusion matrices. Speech signals are processed to extract the Perceptual Linear Prediction (PLP) and Multi-RASTA (MRASTA) features. Neural network classifiers with different architectures are built using these features. The classifiers are analyzed using their confusion matrices. The motivation behind this analysis is to come up with some objective measures which indicate the complementary nature of information in each of the classifiers. These measures are useful for combining a subset of classifiers. The classifiers can be combined using different combination schemes like product, sum, minimum and maximum rules. The significance of the objective measures is demonstrated in terms the results of combination. Classifiers selected through the proposed objective measures seem to provide the best performance.}
}

@inproceedings{schlapbach07fusing,
title = {Fusing asynchronous feature streams for on-line writer identification},
author = {Schlapbach, A.  and Bunke, H. },
booktitle = {Proc. 9th Int. Conf. on Document Analysis and Recognition},
year = {2007},
pages = {103--107},
isbn = {978-0-7695-2822-9},
keywords = {Report_VII, IM2.VP},
peer = {yes}
}

@inproceedings{galan:2007:iccn,
title = {Visuo-spatial attention frame recognition for brain-computer interfaces},
author = {Galán, F.  and Palix, J.  and Chavarriaga, R.  and Ferrez, P. W.  and Lew, E.  and Hauert, C. -A.  and Millán, J. del R. },
booktitle = {Proceedings of the 1st International Conference on Cognitive Neurodynamics},
year = {2007},
keywords = {IM2.VP, Report_VII},
abstract = {Objective: To assess the feasibility of recognizing visual spatial attention frames for Brain-computer interfaces (BCI) applications. Methods: EEG data was recorded with 64 electrodes from 2 subjects executing a visual spatial attention task indicating 2 target locations. Continuous Morlet wavelet coefficients were estimated on 18 frequency components and 16 preselected electrodes in trials of 600 ms. The spatial patterns of the 16 frequency components frames were simultaneously detected and classified (between the two targets). The classification accuracy was assessed using 20-fold crossvalidation. Results: The maximum frames average classification accuracies are 80.64\% and 87.31\% for subject 1 and 2 respectively, both utilizing coefficients estimated at frequencies located in gamma band.}
}

@article{jaimes:ieee-computer:2007,
title = {Human-centered computing: toward a human revolution},
author = {Jaimes, A.  and Gatica-Perez, D.  and Sebe, N.  and Huang, T. S. },
journal = {IEEE Computer},
year = {2007},
volume = {40},
number = {5},
note = {IDIAP-RR 07-57},
doi = {10.1109/mc.2007.169},
keywords = {IM2.HCI, Report_VI},
abstract = {Human-centered computing studies the design, development, and deployment of mixed-initiative human-computer systems. HCC is emerging from the convergence of multiple disciplines that are concerned both with understanding human beings and with the design of computational artifacts.}
}

@article{pardo07,
title = {Speaker Diarization For Multiple-Distant-Microphone Meetings Using Several Sources of Information},
author = {Pardo, J. M.  and Anguera, X.  and Wooters, C. },
journal = {IEEE Transactions on Computers},
note = {to appear},
year = {2007},
keywords = {Report_VI, IM2.AP}
}

@inproceedings{deville:2008:cvavi,
title = {Guiding the focus of attention of blind people with visual saliency},
author = {Deville, B.  and Bologna, G.  and Vinckenbosch, M.  and Pun, T. },
booktitle = {Workshop on Computer Vision Applications for the Visually Impaired (CVAVI 08)},
year = {2008},
keywords = {IM2.MCA, Report_VIII},
vgclass = {refpap},
vgproject = {bmi}
}

@inproceedings{GaticaJM09,
title = {Visual attention, speaking activity, and group conversational analysis in multi-sensor environments},
author = {Gatica-Perez, D.  and Odobez, J. -M. },
booktitle = {H. Nakashima, J. Augusto, H. Aghajan (Eds.), Handbook of Ambient Intelligence and Smart Environments, Springer, in press},
year = {\bibnodate},
keywords = {IM2.MPR, Report_VIII}
}

@article{vepa_ieee_sap_2006,
title = {Subjective evaluation of join cost and smoothing methods for unit selection speech synthesis},
author = {Vepa, J.  and King, S. },
journal = {IEEE Trans. on Audio, Speech and Language Processing},
year = {2006},
volume = {14},
number = {5},
pages = {1763--1771},
note = {IDIAP-RR 05-34},
keywords = {Report_VI, IM2.AP},
abstract = {In unit selection-based concatenative speech synthesis, join cost (also known as concatenation cost), which measures how well two units can be joined together, is one of the main criteria for selecting appropriate units from the inventory. Usually, some form of local parameter smoothing is also needed to disguise the remaining discontinuities. This paper presents a subjective evaluation of three join cost functions and three smoothing methods. We also describe the design and performance of a listening test. The three join cost functions were taken from our previous study, where we proposed join cost functions derived from spectral distances, which have good correlations with perceptual scores obtained for a range of concatenation discontinuities. This evaluation allows us to further validate their ability to predict concatenation discontinuities. The units for synthesis stimuli are obtained from a state-of-the-art unit selection text-to-speech system: rVoice from Rhetorical Systems Ltd. In this paper, we report listeners' preferences for each join cost in combination with each smoothing method.},
ipdmembership = {speech},
ipdxref = {techreport:vepa-rr05-34.bib},
pdf = {ftp://ftp.idiap.ch/pub/reports/2005/rr05-34.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2005/rr05-34.ps.gz}
}

@inproceedings{stoyanchev08,
title = {Name-aware speech recognition for interactive question answering},
author = {Stoyanchev, S.  and Tur, G.  and Hakkani-Tur, D. },
booktitle = {IEEE ICASSP},
year = {2008},
address = {Las Vegas, NV},
keywords = {Report_VII, IM2.AP}
}

@techreport{mesot:rr06-08,
title = {Switching linear dynamical systems for noise robust speech recognition},
author = {Mesot, B.  and Barber, D. },
year = {2006},
type = {IDIAP-RR},
number = {08},
institution = {IDIAP},
keywords = {Report_VI, IM2.AP},
abstract = {Real world applications such as hands-free speech recognition of isolated digits may have to deal with potentially very noisy environments. Existing state-of-the-art solutions to this problem use feature-based HMMs, with a preprocessing stage to clean the noisy signal. However, the effect that raw signal noise has on the induced HMM features is poorly understood, and limits the performance of the HMM system. An alternative to feature-based HMMs is to model the raw signal, which has the potential advantage that including an explicit noise model is straightforward. Here we jointly model the dynamics of both the raw speech signal and the noise, using a Switching Linear Dynamical System (SLDS). The new model was tested on isolated digit utterances corrupted by Gaussian noise. Contrary to the SAR-HMM, which provides a model of uncorrupted raw speech, the SLDS is comparatively noise robust and also significantly outperforms a state-of-the-art feature-based HMM. The computational complexity of the SLDS scales exponentially with the length of the time series. To counter this we use Expectation Correction which provides a stable and accurate linear-time approximation for this important class of models, aiding their further application in acoustic modelling.},
ipdmembership = {speech},
pdf = {ftp://ftp.idiap.ch/pub/reports/2006/mesot-idiap-rr-06-08.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2006/mesot-idiap-rr-06-08.ps.gz}
}

@inproceedings{Pinto_ICASSP_2009,
title = {Volterra Series for Analyzing MLP based Phoneme Posterior Probability Estimator},
author = {Pinto, J. P.  and Sivaram, G. S. V. S.  and Hermansky, H.  and Magimai-Doss, M. },
crossref = {Pinto_Idiap-RR-69-2008},
booktitle = {Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
year = {2009},
keywords = {IM2.AP, Report_VIII},
abstract = {We present a framework to apply Volterra series to analyze multilayered perceptrons trained to estimate the posterior probabilities of phonemes in automatic speech recognition. The identified Volterra kernels reveal the spectro-temporal patterns that are learned by the trained system for each phoneme. To demonstrate the applicability of Volterra series, we analyze a multilayered perceptron trained using Mel filter bank energy features and analyze its first order Volterra kernels.},
projects = {Idiap,
SNSF-KEYSPOT,
SNSF-MULTI,
IM2},
}

@phdthesis{Galan_THESIS_2008,
title = {Methods for Asynchronous and Non-Invasive EEG-Based Brain-Computer Interfaces. Towards Intelligent Brain-Actuated Wheelchairs},
author = {Galán, F. },
year = {2008},
month = {June},
school = {University of Barcelona},
keywords = {IM2.BMI,Report_VII},
projects = {Idiap,
IM2},
}

@inproceedings{Hennebert07:icb,
title = {A new forgery scenario based on regaining dynamics of signature},
author = {Hennebert, J.  and Loeffel, R.  and Humm, A.  and Ingold, R. },
booktitle = {Accepted for publication, International Conference on Biometrics (ICB 2007), Seoul Korea},
year = {2007},
keywords = {Report_VI, IM2.MPR}
}

@inproceedings{Rigamonti2007,
title = {Faericworld: browsing multimedia events through static documents and links},
author = {Rigamonti, M.  and Lalanne, D.  and Ingold, R. },
booktitle = {In proc. of INTERACT 2007},
series = {LNCS},
year = {2007},
pages = {to appear},
publisher = {Springer-Verlag},
address = {Rio De Janeiro, Brasil},
keywords = {Report_VI, IM2.HMI}
}

@inproceedings{quelhas:civr:2006,
title = {Natural scene image modeling using color and texture visterms.},
author = {Quelhas, P.  and Odobez, J. -M. },
booktitle = {Conference on Image and Video Retrieval CIVR},
year = {2006},
note = {IDIAP-RR 06-17},
keywords = {Report_VI, IM2.MCA},
abstract = {This paper presents a novel approach for visual scene representation, combining the use of quantized color and texture local invariant features (referred to here as em visterms) computed over interest point regions. In particular we investigate the different ways to fuse together local information from texture and color in order to provide a better em visterm representation. We develop and test our methods on the task of image classification using a 6-class natural scene database. We perform classification based on the em bag-of-visterms (BOV) representation (histogram of quantized local descriptors), extracted from both texture and color features. We investigate two different fusion approaches at the feature level: fusing local descriptors together and creating one representation of joint texture-color visterms, or concatenating the histogram representation of both color and texture, obtained independently from each local feature. On our classification task we show that the appropriate use of color improves the results w.r.t. a texture only representation.},
ipdmembership = {vision},
ipdxref = {techreport:quelhas-idiap-rr-06-17.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2006/quelhas-civr-2006.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2006/quelhas-civr-2006.ps.gz}
}

@article{Kryszczuk2006RelDecisionFusionJASP,
title = {Reliability-based decision fusion in multimodal biometric verification systems},
author = {Kryszczuk, K.  and Richiardi, J.  and Prodanov, P.  and Drygajlo, A. },
journal = {EURASIP Journal of Advances in Signal Processing},
year = {2007},
note = {(in press)},
keywords = {Report_VI, IM2.MPR},
owner = {Jori}
}

@inproceedings{MMSPL-CONF-2009-010,
title = {Two-level bimodal association for audio-visual speech recognition},
author = {Lee, J. -S.  and Ebrahimi, T. },
booktitle = {International Conference on Advanced Concepts for Intelligent Vision Systems (ACIVS'09)},
year = {\bibnodate},
location = {Bordeaux, France},
keywords = {audio-visual speech recognition; synchronization; cross-modal correlation; canonical correlation analysis, IM2.MCA, Report_VIII},
abstract = {This paper proposes a new method for bimodal information fusion in audio-visual speech recognition, where cross-modal association is considered in two levels. First, the acoustic and the visual data streams are combined at the feature level by using the canonical correlation analysis, which deals with the problems of audio-visual synchronization and utilizing the cross-modal correlation. Second, information streams are integrated at the decision level for adaptive fusion of the streams according to the noise condition of the given speech datum. Experimental results demonstrate that the proposed method is effective for producing noise-robust recognition performance without a priori knowledge about the noise conditions of the speech data.},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/record/139227},
oai-id = {oai:infoscience.epfl.ch:139227},
oai-set = {conf},
review = {REVIEWED},
status = {ACCEPTED},
unit = {MMSPL}
}

@inproceedings{SalKokTosFro-ICPR.08,
title = {3d face recognition using sparse spherical representations},
author = {Llonch, R. Sala and Kokiopoulou, E.  and Tosic, I.  and Frossard, P. },
booktitle = {IEEE Int. Conf. Pattern Recognition (ICPR)},
year = {2008},
keywords = {Report_VII, IM2.DMA.VP, joint publication}
}

@article{Soleymani:2009:IJSC,
title = {Affective characterization of movie scenes based on content analysis and physiological changes},
author = {Soleymani, M.  and Chanel, G.  and Kierkels, J.  and Pun, T. },
journal = {International Journal of Semantic Computing},
year = {2009},
note = {(to appear)},
keywords = {IM2.MCA, Report_VIII},
owner = {Soleymani},
vgclass = {refpap},
vgproject = {bmi}
}

@inproceedings{Hung:ACM-MM:2007,
title = {Using audio and video features to classify the most dominant person in a group meeting},
author = {Hung, H.  and Jayagopi, D.  and Yeo, C.  and Friedland, G.  and Ba, S.  and Odobez, J. -M.  and Ramchandran, K.  and Mirghafori, N.  and Gatica-Perez, D. },
booktitle = {Proc. ACM Multi Media, Augsburg, Germany},
year = {2007},
keywords = {Report_VII, IM2.AP.VP, joint publication}
}

@article{stolcke07b,
title = {Speaker Recognition with Session Variability Normalization Based on MLLR Adaptation Transforms},
author = {Stolcke, A.  and Kajarekar, S.  and Ferrer, L.  and Shriberg, E. },
journal = {IEEE Transactions on Audio, Speech, and Language Processing, special issue on speaker and language recognition},
year = {2007},
keywords = {Report_VII, IM2.AP}
}

@article{stolcke07a,
title = {The sri-icsi spring 2007 meeting and lecture recognition system},
author = {Stolcke, A.  and Anguera, X.  and Boakye, K.  and Cetin, O.  and Janin, A.  and Magimai-Doss, M.  and Wooters, C.  and Zheng, J. },
journal = {Lecture Notes in Computer Science},
year = {2007},
keywords = {Report_VII, IM2.AP, joint publication}
}

@phdthesis{keller:phd:2006,
title = {Machine learning approaches to text representation using unlabeled data},
author = {Keller, M. },
year = {2006},
school = {Ecole Polytechnique F\'ed\'erale de Lausanne},
note = {IDIAP-RR 06-76},
keywords = {Report_VI, IM2.MPR.MCA},
abstract = {With the rapid expansion in the use of computers for producing digitalized textual documents, the need of automatic systems for organizing and retrieving the information contained in large databases has become essential. In general, information retrieval systems rely on a formal description or representation of documents enabling their automatic processing. In the most common representation, the so-called bag-of-words, documents are represented by the words composing them and two documents (or a user query and a document) are considered similar if they have a high number of co-occurring words. In this representation, documents with different, but semantically related terms will be considered as unrelated, and documents using the same terms but in different contexts will be seen as similar. It arises quite naturally that information retrieval systems can use the huge amount of existing textual documents in order to learn'', as humans do, the different uses of words depending on the context. This information can be used to enrich documents' representation. In this thesis dissertation we develop several original machine learning approaches which attempt at fulfilling this aim. As a first approach to document representation we propose a probabilistic model in which documents are assumed to be issued from a mixture of distributions over themes, modeled by a hidden variable conditioning a multinomial distribution over words. Simultaneously, words are assumed to be drawn from a mixture of distributions over topics, modeled by a second hidden variable dependent on the themes. As a second approach, we proposed a neural network which is trained to give a score for the appropriateness of a word in a given context. Finally we present, a multi-task learning approach, which is trained jointly to solve an information retrieval task, while learning on unlabeled data to improve its representation of documents.},
ipdmembership = {learning},
ipdxref = {techreport:keller-idiap-rr-06-76.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2006/keller-phd-2006.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2006/keller-phd-2006.ps.gz}
}

@techreport{latorre:rr06-53,
title = {Melanoma recognition using kernel classifiers},
author = {Torre, E. L.  and Caputo, B.  and Tommasi, T. },
year = {2006},
type = {IDIAP-RR},
number = {53},
institution = {IDIAP},
keywords = {Report_VI, IM2.VP},
abstract = {Melanoma is the most deadly skin cancer. Early diagnosis is a current challenge for clinicians. Current algorithms for skin lesions classification focus mostly on segmentation and feature extraction. This paper instead puts the emphasis on the learning process, proposing two kernel-based classifiers: support vector machines, and spin glass-Markov random fields. We benchmarked these algorithms against a state-of-the-art method on melanoma recognition. We show with extensive experiments that the support vector machine approach outperforms the other methods, proving to be an effective classification algorithm for computer assisted diagnosis of melanoma.},
ipdmembership = {vision},
pdf = {ftp://ftp.idiap.ch/pub/reports/2006/latorre-idiap-rr-06-53.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2006/latorre-idiap-rr-06-53.ps.gz}
}

@inproceedings{pinto:SSCS:2008,
title = {Fast approximate spoken term detection from sequence of phonemes},
author = {Pinto, J. P.  and Szoke, I.  and Prasanna, S. R. Mahadeva and Hermansky, H. },
crossref = {pinto:rr08-45},
booktitle = {The 31st Annual International ACM SIGIR Conference 20-24 July 2008},
series = {31st International ACM SIGIR Conference},
year = {2008},
pages = {28--33},
location = {Singapore,},
note = {IDIAP-RR 08-45},
keywords = {IM2.AP, Report_VII},
abstract = {We investigate the detection of spoken terms in conversational speech using phoneme recognition with the objective of achieving smaller index size as well as faster search speed. Speech is processed and indexed as a sequence of one best phoneme sequence. We propose the use of a probabilistic pronunciation model for the search term to compensate for the errors in the recognition of phonemes. This model is derived using the pronunciation of the word and the phoneme confusion matrix. Experiments are performed on the conversational telephone speech database distributed by NIST for the 2006 spoken term detection. We achieve about 1500 times smaller index size and 14 times faster search speed compared to the state-of-the-art system using phoneme lattice at the cost of relatively lower detection performance.}
}

@inproceedings{riesen07graphEmbedding,
title = {Graph embedding in vector spaces by means of prototype selection},
author = {Riesen, K.  and Neuhaus, M.  and Bunke, H. },
editor = {Escolano, F.  and Vento, M. },
booktitle = {Graph-Based Representations in Pattern Recognition},
series = {Lecture Notes in Computer Science},
year = {2007},
volume = {4538},
pages = {383--393},
publisher = {Springer},
keywords = {Report_VI, IM2.ACP},
peer = {yes}
}

@inproceedings{Villan:SPIE2007:RH,
title = {Tamper-proofing of Electronic and Printed Text Documents via Robust Hashing and Data-Hiding},
author = {Villán, R.  and Voloshynovskiy, S.  and Koval, O.  and Deguillaume, F.  and Pun, T. },
booktitle = {Proceedings of SPIE-IS\&T Electronic Imaging 2007, Security, Steganography, and Watermarking of Multimedia Contents IX},
year = {2007},
url = {http://vision.unige.ch/publications/postscript/2007/SPIE-EI-2007-Robust-Hashing-paper.pdf},
keywords = {Report_VI, IM2.MPR}
}

@phdthesis{zhang-thesis,
title = {Probabilistic graphical models for human interaction analysis},
author = {Zhang, D. },
year = {2006},
school = {\'Ecole Polytechnique F\'ed\'erale de Lausanne},
note = {thesis \# (IDIAP-RR 06-78)},
keywords = {Report_VI, IM2.MPR.HMI},
abstract = {The objective of this thesis is to develop probabilistic graphical models for analyzing human interaction in meetings based on multimodel cues. We use meeting as a study case of human interactions since research shows that high complexity information is mostly exchanged through face-to-face interactions. Modeling human interaction provides several challenging research issues for the machine learning community. In meetings, each participant is a multimodal data stream. Modeling human interaction involves simultaneous recording and analysis of multiple multimodal streams. These streams may be asynchronous, have different frame rates, exhibit different stationarity properties, and carry complementary (or correlated) information. In this thesis, we developed three probabilistic graphical models for human interaction analysis. The proposed models use the probabilistic graphical model'' formalism, a formalism that exploits the conjoined capabilities of graph theory and probability theory to build complex models out of simpler pieces. We first introduce the multi-layer framework, in which the first layer models typical individual activity from low-level audio-visual features, and the second layer models the interactions. The two layers are linked by a set of posterior probability-based features. Next, we describe the team-player influence model, which learns the influence of interacting Markov chains within a team. The team-player influence model has a two-level structure: individual-level and group-level. Individual level models actions of each player, and the group-level models actions of the team as a whole. The influence of each player on the team is jointly learned with the rest of the model parameters in a principled manner using the Expectation-Maximization (EM) algorithm. Finally, we describe the semi-supervised adapted HMMs for unusual event detection. 
Unusual events are characterized by a number of features (rarity, unexpectedness, and relevance) that limit the application of traditional supervised model-based approaches. We propose a semi-supervised adapted Hidden Markov Model (HMM) framework, in which usual event models are first learned from a large amount of (commonly available) training data, while unusual event models are learned by Bayesian adaptation in an unsupervised manner.},
ipdmembership = {Vision},
pdf = {ftp://ftp.idiap.ch/pub/reports/2006/rr06-78.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2006/rr06-78.ps.gz}
}

@techreport{Ganapathy_Idiap-RR-74-2008,
title = {Modified Discrete Cosine Transform for Encoding Residual Signals in Frequency Domain Linear Prediction},
author = {Ganapathy, S.  and Motlicek, P.  and Hermansky, H. },
year = {2008},
type = {Idiap-RR},
number = {Idiap-RR-74-2008},
institution = {Idiap},
keywords = {IM2.AP, Report_VIII},
projects = {Idiap,
IM2},
}

@inproceedings{Weinshall_NIPS_2008,
title = {Beyond Novelty Detection: Incongruent Events, when General and Specific Classifiers Disagree},
author = {Weinshall, D.  and Hermansky, H.  and Zweig, A.  and Luo, J.  and Jimison, H.  and Ohl, F.  and Pavel, M. },
booktitle = {Advances in Neural Information Processing Systems 21},
year = {2008},
keywords = {IM2.MPR, Report_VIII},
abstract = {Unexpected stimuli are a challenge to any machine learning algorithm. Here we identify distinct types of unexpected events, focusing on 'incongruent events' - when 'general level' and 'specific level' classifiers give conflicting predictions. We define a formal framework for the representation and processing of incongruent events: starting from the notion of label hierarchy, we show how partial order on labels can be deduced from such hierarchies. For each event, we compute its probability in different ways, based on adjacent levels (according to the partial order) in the label hierarchy. An incongruent event is an event where the probability computed based on some more specific level (in accordance with the partial order) is much smaller than the probability computed based on some more general level, leading to conflicting predictions. We derive algorithms to detect incongruent events from different types of hierarchies, corresponding to class membership or part membership. Respectively, we show promising results with real data on two specific problems: Out Of Vocabulary words in speech recognition, and the identification of a new sub-class (e.g., the face of a new individual) in audio-visual facial object recognition.},
projects = {DIRAC},
}

@inproceedings{DrygWeifZhu1,
title = {Q-stack aging model for face verification},
author = {Drygajlo, A.  and Li, W.  and Zhu, K. },
booktitle = {17th European Signal Processing Conference},
year = {2009},
keywords = {IM2.MPR, Report_VIII}
}

@techreport{Mesot2007a,
title = {A bayesian switching linear dynamical system for scale-invariant robust speech extraction},
author = {Mesot, B.  and Barber, D. },
year = {2007},
institution = {Idiap Research Institute},
keywords = {Report_VII, IM2.AP},
abstract = {Most state-of-the-art automatic speech recognition (ASR) systems deal with noise in the environment by extracting noise robust features which are subsequently modelled by a Hidden Markov Model (HMM). A limitation of this feature-based approach is that the influence of noise on the features is difficult to model explicitly and the HMM is typically over sensitive, dealing poorly with unexpected and severe noise environments. An alternative is to model the raw signal directly which has the potential advantage of allowing noise to be explicitly modelled. A popular way to model raw speech signals is to use an Autoregressive (AR) process. AR models are however very sensitive to variations in the amplitude of the signal. Our proposed Bayesian Autoregressive Switching Linear Dynamical System (BAR-SLDS) treats the observed noisy signal as a scaled, clean hidden signal plus noise. The variance of the noise and signal scaling factor are automatically adapted, enabling the robust identification of scale-invariant clean signals in the presence of noise.}
}

@inproceedings{Zhao:ICIS:2009,
title = {Multi-stream to many-stream: using spectro-temporal features for {ASR}},
author = {Zhao, S. Y.  and Ravuri, R.  and Morgan, N. },
booktitle = {10th International Conference of the International Speech Communication Association, Brighton, UK},
year = {2009},
keywords = {IM2.AP, Report_VIII}
}

@inproceedings{aradilla:mlmi:2007,
title = {Posterior-based features and distances in template matching for speech recognition},
author = {Aradilla, G.  and Bourlard, H. },
booktitle = {4th Joint Workshop on Multimodal Interaction and Related Machine Learning Algorithms (MLMI)},
year = {2007},
volume = {4892},
pages = {204--214},
note = {IDIAP-RR 07-41},
doi = {10.1007/978-3-540-78155-4_18},
keywords = {Report_VII, IM2.AP},
abstract = {The use of large speech corpora in example-based approaches for speech recognition is mainly focused on increasing the number of examples. This strategy presents some difficulties because databases may not provide enough examples for some rare words. In this paper we present a different method to incorporate the information contained in such corpora in these example-based systems. A multilayer perceptron is trained on these databases to estimate speaker and task-independent phoneme posterior probabilities, which are used as speech features. By reducing the variability of features, fewer examples are needed to properly characterize a word. In this way, performance can be highly improved when limited number of examples is available. Moreover, we also study posterior-based local distances, these result more effective than traditional Euclidean distance. Experiments on Phonebook database support the idea that posterior features with a proper local distance can yield competitive results.}
}

@phdthesis{ferrez_thesis_2007,
title = {Error-related {EEG} potentials in brain-computer interfaces},
author = {Ferrez, P. W. },
year = {2007},
school = {\'Ecole Polytechnique F\'ed\'erale de Lausanne},
note = {PhD Thesis \#3928 at the \'Ecole Polytechnique F\'ed\'erale de Lausanne},
keywords = {IM2.BCI, Report_VII},
abstract = {People with severe motor disabilities (spinal cord injury (SCI), amyotrophic lateral sclerosis (ALS), etc.) but with intact brain functions are somehow prisoners of their own body. They need alternative ways of communication and control to interact with their environment in their everyday life. These new tools are supposed to increase their quality of life by giving these people the opportunity to recover part of their independence. Therefore, these alternative ways have to be reliable and ergonomic to be successfully used by disabled people. Over the past two decades, numerous studies proposed electroencephalogram (EEG) activity for direct brain-computer interaction. EEG-based brain-computer interfaces (BCIs) provide disabled people with new tools for control and communication and are promising alternatives to invasive methods. However, as any other interaction modality based on physiological signals and body channels (muscular activity, speech and gestures, etc.), BCIs are prone to errors in the recognition of subject's intent, and those errors can be frequent. Indeed, even well-trained subjects rarely reach 100 percent of success. In contrast to other interaction modalities, a unique feature of the brain channel is that it conveys both information from which we can derive mental control commands to operate a brain-actuated device as well as information about cognitive states that are crucial for a purposeful interaction, all this on the millisecond range. One of these states is the awareness of erroneous responses, which a number of groups have recently proposed as a way to improve the performance of BCIs. However, most of these studies propose the use of error-related potentials (ErrP) following an error made by the subject himself. This thesis first describes a new kind of ErrP, the so-called interaction ErrP, that are present in the ongoing EEG following an error of the interface and no longer errors of the subject himself. 
More importantly, these ErrP are satisfactorily detected no more in grand averages but at the level of single trials. Indeed, the classification rates of both error and correct single trials based on error-potentials detection are on average 80 percent. At this level it becomes possible to introduce a kind of automatic verification procedure in the BCI: after translating the subject's intention into a control command, the BCI provides a feedback of that command, but will not transfer it to the device if ErrP follow the feedback. Experimental results presented in this thesis confirm that this new protocol greatly increases the reliability of the BCI. Furthermore, this tool turns out to be of great benefit especially for beginners who normally reach moderate performances. Indeed, filtering out wrong responses increases the user's confidence in the interface and thus accelerates mastering the control of the brain-actuated device. The second issue explored in this thesis is the practical integration of ErrP detection in a BCI. Indeed, providing a first feedback of the subject's intent, as recognized by the BCI, before eventually sending the command to the controlled device, induces additional information to process by the subject and may considerably slow down the interaction since the introduction of an automatic response rejection strongly interferes with the BCI. However, this study shows the feasibility of simultaneously and satisfactorily detecting erroneous responses of the interface and classifying motor imagination for device control at the level of single trials. The integration of an automatic error detection procedure leads to great improvements of the BCI performance. Another aspect of this thesis is to investigate the potential benefit of using neurocognitive knowledge to increase the classification rate of ErrP, and more generally the performance of the BCI. 
Recent findings have uncovered that ErrP are most probably generated in a deep fronto-central brain area called anterior cingulate cortex (ACC). This hypothesis is verified using a well-known inverse model called sLORETA. Indeed, the localization provided for ErrP shows clear foci of activity both in the ACC and the pre-supplementary motor area (pre-SMA). The localization results using the cortical current density (CCD) model are very similar and more importantly, this model outperforms EEG for ErrP classification. Thanks to its stability, this model is likely to be successfully used in a BCI framework. The ELECTRA model for estimating local field potentials is also tested, but classification and localization results using this method are not so encouraging. More generally, the work described here suggests that it could be possible to recognize in real time high-level cognitive and emotional states from EEG (as opposed, and in addition, to motor commands) such as alarm, fatigue, frustration, confusion, or attention that are crucial for an effective and purposeful interaction. Indeed, the rapid recognition of these states will lead to truly adaptive interfaces that customize dynamically in response to changes of the cognitive and emotional/affective states of the user.}
}

@techreport{Mariethoz_Idiap-RR-68-2008,
title = {Kernel Based Text-Independent Speaker Verification},
author = {Mari\'ethoz, J.  and Bengio, S.  and Grandvalet, Y. },
year = {2008},
type = {Idiap-RR},
number = {Idiap-RR-68-2008},
institution = {Idiap},
keywords = {IM2.AP, Report_VIII},
projects = {Idiap,
IM2},
}

@techreport{Hung_Idiap-RR-20-2009,
title = {Speech/Non-Speech Detection in Meetings from Automatically Extracted Low Resolution Visual Features},
author = {Hung, H.  and Ba, S. },
year = {2009},
type = {Idiap-RR},
number = {Idiap-RR-20-2009},
institution = {Idiap},
note = {submitted to icmi-mlmi},
keywords = {IM2.MCA, IM2.MPR, Report_VIII},
abstract = {In this paper we address the problem of estimating who is speaking from automatically extracted low resolution visual cues from group meetings. Traditionally, the task of speech/non-speech detection or speaker diarization tries to find who speaks and when from audio features only. Recent work has addressed the problem audio-visually but often with less emphasis on the visual component. Due to the high probability of losing the audio stream during video conferences, this work proposes methods for estimating speech using just low resolution visual cues. We carry out experiments to compare how context through the observation of group behaviour and task-oriented activities can help improve estimates of speaking status. We test on 105 minutes of natural meeting data with unconstrained conversations.},
projects = {Idiap,
IM2},
}

@techreport{orabona:rr07-63,
title = {On-line independent support vector machines for cognitive systems},
author = {Orabona, F.  and Castellini, C.  and Caputo, B.  and Luo, J.  and Sandini, G. },
year = {2007},
type = {Idiap-RR},
number = {Idiap-RR-63-2007},
institution = {IDIAP},
keywords = {IM2.MPR, Report_VII},
abstract = {Learning from experience and adapting to changing stimuli are fundamental capabilities for artificial cognitive systems. This calls for on-line learning methods able to achieve high accuracy while at the same time using limited computer power. Research on autonomous agents has been actively investigating these issues, mostly using probabilistic frameworks and within the context of navigation and learning by imitation. Still, recent results on robot localization have clearly pointed out the potential of discriminative classifiers for cognitive systems. In this paper we follow this approach and propose an on-line version of the Support Vector Machine (SVM) algorithm. Our method, that we call On-line Independent SVM, builds a solution on-line, achieving an excellent accuracy vs. compactness trade-off. In particular the size of the obtained solution is always bounded, implying a bounded testing time. At the same time, the algorithm converges to the optimal solution at each incremental step, as opposed to similar approaches where optimality is achieved in the limit of infinite number of training data. These statements are supported by experiments on standard benchmark databases as well as on two real-world applications, namely $(a)$ place recognition by a mobile robot in an indoor environment, and $(b)$ human grasping posture classification.}
}

@inproceedings{Gelbart:ICIS:2009,
title = {Hill-climbing feature selection for multi-stream {ASR}},
author = {Gelbart, D.  and Morgan, N.  and Tsymbal, A. },
booktitle = {10th International Conference of the International Speech Communication Association, Brighton, UK},
year = {2009},
keywords = {IM2.AP, Report_VIII}
}

@inproceedings{liwicki07novel,
title = {A novel approach to on-line handwriting recognition based on bidirectional long short-term memory networks},
author = {Liwicki, M.  and Graves, A.  and Bunke, H.  and Schmidhuber, J. },
booktitle = {Proc. 9th Int. Conf. on Document Analysis and Recognition},
year = {2007},
pages = {367--371},
isbn = {978-0-7695-2822-9},
keywords = {Report_VII, IM2.VP},
peer = {yes}
}

@inproceedings{frinken09improving,
title = {Improving graph classification by isomap},
author = {Frinken, V.  and Riesen, K.  and Bunke, H. },
editor = {Torsello, A.  and Escolano, F.  and Brun, L. },
booktitle = {Graph-Based Representations in Pattern Recognition},
series = {LNCS 5534},
year = {2009},
pages = {205--214},
publisher = {Springer},
keywords = {IM2.VP, Report_VIII},
peer = {yes}
}

@article{Prodanov2007LowLevelGrounding,
title = {Low-level grounding in a multimodal mobile service robot conversational system using graphical models},
author = {Prodanov, P.  and Drygajlo, A.  and Richiardi, J.  and Alexander, A. },
journal = {Intelligent Service Robotics},
year = {2008},
volume = {1},
pages = {3--26},
doi = {10.1007/s11370-006-0001-9},
keywords = {Report_VII, IM2.MPR}
}

@techreport{vinyals07,
title = {Revisiting a basic function on current {CPUs}: A fast logarithm implementation with adjustable accuracy},
author = {Vinyals, O.  and Friedland, G.  and Mirghafori, N. },
year = {2007},
type = {ICSI Technical Report},
number = {TR-07-002},
institution = {ICSI},
keywords = {Report_VII, IM2.AP}
}

@inproceedings{BBH-eusipco08,
title = {The spherical approach to omnidirectional visual attention},
author = {Bogdanova, I.  and Bur, A.  and H{\"u}gli, H. },
booktitle = {XVI European Signal Processing Conference (EUSIPCO 2008)},
series = {Proc. EUSIPCO},
year = {2008},
keywords = {Report_VII, IM2.VP}
}

@inproceedings{gangadhar:grazBCI2008:2008,
title = {Recognition of anticipatory behavior from human {EEG}},
author = {Garipelli, G.  and Chavarriaga, R.  and Millan, J. del R. },
booktitle = {4th Intl. Brain-Computer Interface Workshop and Training Course},
year = {2008},
organization = {Graz University, Austria},
note = {IDIAP-RR 08-52},
keywords = {IM2.BMI, Report_VII},
abstract = {Anticipation increases the efficiency of a daily task by partial advance activation of neural substrates involved in it. Single trial recognition of this activation can be exploited for a novel anticipation based Brain Computer Interface (BCI). In the current work we compare different methods for the recognition of Electroencephalogram (EEG) correlates of this activation on single trials as a first step towards building such a BCI. To do so, we recorded EEG from 9 subjects performing a classical Contingent Negative Variation (CNV) paradigm (usually reported for studying anticipatory behavior in neurophysiological experiments) with GO and NOGO conditions. We first compare classification accuracies with features such as Least Square fitting Line (LSFL) parameters and Least Square Fitting Polynomial (LSFP) coefficients using a Quadratic Discriminant Analysis (QDA) classifier. We then test the best features with complex classifiers such as Gaussian Mixture Models (GMMs) and Support Vector Machines (SVMs).}
}

@article{Voloshynovskiy:2007:SPIE-Biometric,
title = {Authentication of biometric identification documents via mobile devices},
author = {Voloshynovskiy, S.  and Koval, O.  and Vill{\'a}n, R.  and Beekhof, F.  and Pun, T. },
journal = {Journal of Electronic Imaging},
year = {2008},
keywords = {Report_VII, IM2.MPR}
}

@incollection{Drygajlo2006ManMachine,
title = {Man-machine voice communication},
author = {Drygajlo, A. },
editor = {Rajman, M. },
booktitle = {Speech and Language Engineering},
year = {2007},
pages = {433--461},
publisher = {EPFL Press},
doi = {10.1016/j.forsciint.2006.06.037},
keywords = {Report_VI, IM2.MPR},
owner = {Andrzej}
}

@inproceedings{Hennebert07:icassp,
title = {Modelling spoken signatures with gaussian mixture model adaptation},
author = {Hennebert, J.  and Humm, A.  and Ingold, R. },
booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 07)},
year = {2007},
keywords = {Report_VI, IM2.MPR}
}

@phdthesis{Smith_THESIS_2007,
title = {Bayesian methods for visual multi-object tracking with applications to human activity recognition},
author = {Smith, K. },
year = {2007},
school = {\'Ecole Polytechnique F\'ed\'erale de Lausanne},
note = {Th{\`e}se sciences \'Ecole polytechnique f\'ed\'erale de Lausanne EPFL, no 3745 (2007), Facult\'e des sciences et techniques de l'ing\'enieur STI, Section de g\'enie \'electrique et \'electronique, Institut de g\'enie \'electrique et \'electronique IEL (Laboratoire de l'IDIAP LIDIAP). Dir.: Herv\'e Bourlard, Daniel Gatica-Perez},
keywords = {IM2.VP, Report_VII},
abstract = {In recent years, we have seen a dramatic increase in the amount of video data recorded and stored around the world. Driven by the availability of low-cost video cameras, the ever-decreasing cost of digital media storage, and the explosion in popularity of video sharing across the Internet, there is a growing demand for sophisticated methods to automatically analyze and understand video content. One of the most fundamental processes to understanding video content is visual multi-object tracking, which is the process of locating, identifying, and determining the dynamic configuration of one or many moving (possibly deformable) objects in each frame of a video sequence. In this dissertation, we focus on a general probabilistic approach known as recursive state-space Bayesian estimation, which estimates the unknown probability distribution of the state of the objects recursively over time, using information extracted from video data. The central problem addressed in this dissertation is the development of novel probabilistic models using this framework to perform accurate, robust automatic visual multi-object tracking. In addressing this problem, we consider the following questions: What types of probabilistic models can we develop to improve the state-of-the-art, and where do the improvements come from? What benefits and drawbacks are associated with these models? How can we objectively evaluate the performance of a multi-object tracking model? How can a probabilistic multi-object tracking model be extended to perform human activity recognition tasks? Over the course of our work, we attempt to provide an answer to each of these questions, beginning with a proposal for a comprehensive set of measures and a formal evaluation protocol for evaluating multi-object tracking performance. 
We proceed by defining two new probabilistic tracking models: one which improves the efficiency of a state-of-the-art model, the Distributed Partitioned Sampling Particle Filter (DPS PF), and one which provides a formal framework for efficiently tracking a variable number of objects, the Reversible Jump Markov Chain Monte Carlo Particle Filter (RJMCMC PF). Using our proposed evaluation framework, we compare our proposed models with other state-of-the-art tracking methods in a meeting room head tracking task. Finally, we show how the RJMCMC PF can be applied to human activity recognition tasks such as detecting abandoned luggage items in a busy train terminal and determining if and when pedestrians look at an outdoor advertisement as they pass.}
}

@techreport{aradilla:rr08-14,
title = {Using {KL}-based acoustic models in a large vocabulary recognition task},
author = {Aradilla, G.  and Bourlard, H.  and Magimai-Doss, M. },
year = {2008},
type = {Idiap-RR},
number = {Idiap-RR-14-2008},
institution = {IDIAP},
keywords = {IM2.AP, Report_VII},
abstract = {Posterior probabilities of sub-word units have been shown to be an effective front-end for ASR. However, attempts to model this type of features either do not benefit from modeling context-dependent phonemes, or use an inefficient distribution to estimate the state likelihood. This paper presents a novel acoustic model for posterior features that overcomes these limitations. The proposed model can be seen as a HMM where the score associated with each state is the KL divergence between a distribution characterizing the state and the posterior features from the test utterance. This KL-based acoustic model establishes a framework where other models for posterior features such as hybrid HMM/MLP and discrete HMM can be seen as particular cases. Experiments on the WSJ database show that the KL-based acoustic model can significantly outperform these latter approaches. Moreover, the proposed model can obtain comparable results to complex systems, such as HMM/GMM, using significantly fewer parameters.}
}

@inproceedings{millan:2007:hcii,
title = {Brain-machine interfaces through control of electroencephalographic signals and vibrotactile feedback},
author = {Aloise, F.  and Caporusso, N.  and Mattia, D.  and Babiloni, F.  and Kauhanen, L.  and Mill{\'a}n, J. del R.  and Nuttin, M.  and Marciani, M. G.  and Cincotti, F. },
booktitle = {Proceedings of the 12th International Conference on Human-Computer Interaction},
year = {2007},
volume = {125},
keywords = {IM2.BCI, Report_VII},
abstract = {A Brain-Computer Interface (BCI) allows direct expression of its user's will by interpreting signals which directly reflect the brain's activity, thus bypassing the natural efferent channels (nerves and muscles). To be correctly mastered, it is needed that this artificial efferent channel is complemented by an artificial feedback, which continuously informs the user about the current state (in the same way as proprioceptors give a feedback about joint angle and muscular tension). This feedback is usually delivered through the visual channel. We explored the benefits of vibrotactile feedback during users' training and control of EEG-based BCI applications. A protocol for delivering vibrotactile feedback, including specific hardware and software arrangements, was specified and implemented. Thirteen subjects participated in an experiment where the feedback of the BCI system was delivered either through a visual display, or through a vibrotactile display, while they performed a virtual navigation task. Attention to the task was probed by presenting visual cues that the subjects had to describe afterwards. When compared with visual feedback, the use of tactile feedback did not decrease BCI control performance; on the other side, it improved the capacity of subjects to concentrate on the requested (visual) task. During experiments, vibrotactile feedback felt (after some training) more natural. This study indicated that the vibrotactile channel can function as a valuable feedback modality in the context of BCI applications. Advantages of using a vibrotactile feedback emerged when the visual channel was highly loaded by a complex task.}
}

@inproceedings{negoescu:civr-08,
title = {Analyzing flickr groups},
author = {Negoescu, R. -A.  and Gatica-Perez, D. },
booktitle = {Proceedings of the 2008 international conference on Content-based image and video retrieval (CIVR '08)},
year = {2008},
type = {Idiap-RR},
number = {Idiap-RR-03-2008},
institution = {IDIAP},
location = {Sheraton Fallsview Hotel, Niagara Falls, Canada},
isbn = {978-1-60558-070-8},
note = {To appear in Proceedings of CIVR'08},
keywords = {IM2.VP, Report_VII},
abstract = {There is an explosion of community-generated multimedia content available online. In particular, Flickr constitutes a 200-million photo sharing system where users participate following a variety of social motivations and themes. Flickr groups are increasingly used to facilitate the explicit definition of communities sharing common interests, which translates into large amounts of content (e.g. pictures and associated tags) about specific subjects. However, to our knowledge, an in-depth analysis of user behavior in Flickr groups remains open, as does the existence of effective tools to find relevant groups. Using a sample of about 7 million user-photos and about 51000 Flickr groups, we present a novel statistical group analysis that highlights relevant patterns of photo-to-group sharing practices. Furthermore, we propose a novel topic-based representation model for groups, computed from aggregated group tags. Groups are represented as multinomial distributions over semantically meaningful latent topics learned via unsupervised probabilistic topic modeling. We show this representation to be useful for automatically discovering groups of groups and topic expert-groups, for designing new group-search strategies, and for obtaining new insights of the semantic structure of Flickr groups.}
}

@inproceedings{Aradilla2007,
title = {Detection and recognition of number sequences within spoken utterances},
author = {Aradilla, G.  and Ajmera, J. },
booktitle = {2nd Workshop on Speech in Mobile and Pervasive Environments},
year = {2007},
keywords = {Report_VII, IM2.AP},
abstract = {In this paper we investigate the detection and recognition of sequences of numbers in spoken utterances. This is done in two steps: first, the entire utterance is decoded assuming that only numbers were spoken. In the second step, non-number segments (garbage) are detected based on word confidence measures. We compare this approach to conventional garbage models. Also, a comparison of several phone posterior based confidence measures is presented in this paper. The work is evaluated in terms of detection task (hit rate and false alarms) and recognition task (word accuracy) within detected number sequences. The proposed method is tested on German continuous spoken utterances where target content (numbers) is only 20\%.}
}

@phdthesis{monay:phd:2007,
title = {Learning the structure of image collections with latent aspect models},
author = {Monay, F. },
school = {\'Ecole Polytechnique F\'ed\'erale de Lausanne},
year = {2007},
note = {IDIAP-RR 07-06},
keywords = {Report_VI, IM2.MCA},
abstract = {The approach to indexing an image collection depends on the type of data to organize. Satellite images are likely to be searched with latitude and longitude coordinates, medical images are often searched with an image example that serves as a visual query, and personal image collections are generally browsed by event. A more general retrieval scenario is based on the use of textual keywords to search for images containing a specific object, or representing a given scene type. This requires the manual annotation of each image in the collection to allow for the retrieval of relevant visual information based on a text query. This time-consuming and subjective process is the current price to pay for a reliable and convenient text-based image search. This dissertation investigates the use of probabilistic models to assist the automatic organization of image collections, attempting to link the visual content of digital images with a potential textual description. Relying on robust, patch-based image representations that have proven to capture a variety of visual content, our work proposes to model images as mixtures of emphlatent aspects. These latent aspects are defined by multinomial distributions that capture patch co-occurrence information observed in the collection. An image is not represented by the direct count of its constituting elements, but as a mixture of latent aspects that can be estimated with principled, generative unsupervised learning methods. An aspect-based image representation therefore incorporates contextual information from the whole collection that can be exploited. This emerging concept is explored for several fundamental tasks related to image retrieval - namely classification, clustering, segmentation, and annotation - in what represents one of the first coherent and comprehensive study of the subject. 
We first investigate the possibility of classifying images based on their estimated aspect mixture weights, interpreting latent aspect modeling as an unsupervised feature extraction process. Several image categorization tasks are considered, where images are classified based on the present objects or according to their global scene type. We demonstrate that the concept of latent aspects allows to take advantage of non-labeled data to infer a robust image representation that achieves a higher classification performance than the original patch-based representation. Secondly, further exploring the concept, we show that aspects can correspond to an interesting soft clustering of an image collection that can serve as a browsing structure. Images can be ranked given an aspect, illustrating the corresponding co-occurrence context visually. In the third place, we derive a principled method that relies on latent aspects to classify image patches into different categories. This produces an image segmentation based on the resulting spatial class-densities. We finally propose to model images and their caption with a single aspect model, merging the co-occurrence contexts of the visual and the textual modalities in different ways. Once a model has been learned, the distribution of words given an unseen image is inferred based on its visual representation, and serves as textual indexing. Overall, we demonstrate with extensive experiments that the co-occurrence context captured by latent aspects is suitable for the above mentioned tasks, making it a promising approach for multimedia indexing.},
ipdmembership = {vision},
ipdxref = {techreport:monay-idiap-rr-07-06.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2007/monay-phd-2007.pdf}
}

@article{AndreasHumm20091,
  author   = {Humm, A. and Hennebert, J. and Ingold, R.},
  title    = {Combined handwriting and speech modalities for user authentication},
  journal  = {IEEE Transactions on Systems, Man, and Cybernetics, Part A: Systems and Humans},
  year     = {2009},
  volume   = {39},
  keywords = {IM2.MPR, Report_VIII},
}

@inproceedings{millan:2007:bvai,
  author    = {Mill{\'a}n, J. del R. and Ferrez, P. W. and Gal{\'a}n, F. and Lew, E. and Chavarriaga, R.},
  title     = {Non-invasive brain-actuated interaction},
  booktitle = {Proceedings of the 2nd International Symposium on Brain, Vision and Artificial Intelligence},
  year      = {2007},
  volume    = {4729},
  issn      = {0302-9743},
  doi       = {10.1007/978-3-540-75555-5},
  keywords  = {IM2.BMI, Report_VII},
  abstract  = {The promise of Brain-Computer Interfaces (BCI) technology is to augment human capabilities by enabling interaction with computers through a conscious and spontaneous modulation of the brainwaves after a short training period. Indeed, by analyzing brain electrical activity online, several groups have designed brain-actuated devices that provide alternative channels for communication, entertainment and control. Thus, a person can write messages using a virtual keyboard on a computer screen and also browse the internet. Alternatively, subjects can operate simple computer games, or brain games, and interact with educational software. Work with humans has shown that it is possible for them to move a cursor and even to drive a wheelchair. This paper briefly reviews the field of BCI, with a focus on non-invasive systems based on electroencephalogram (EEG) signals. It also describes three brain-actuated devices we have developed: a virtual keyboard, a brain game, and a wheelchair. Finally, it shortly discusses current research directions we are pursuing in order to improve the performance and robustness of our BCI system, especially for real-time control of brain-actuated robots.},
}

@inproceedings{Kryszczuk2007RelEstimationPRIS,
  author    = {Kryszczuk, K. and Richiardi, J. and Drygajlo, A.},
  title     = {Reliability estimation for multimodal error prediction and fusion},
  booktitle = {Proc. 7th Int. Workshop on Pattern Recognition in Information Systems (PRIS 2007)},
  year      = {2007},
  keywords  = {Report_VI, IM2.MPR},
  owner     = {Jori},
}

@inproceedings{AndreasHumm20099,
  author    = {Humm, A. and Ingold, R. and Hennebert, J.},
  title     = {Spoken handwriting for user authentication using joint modelling systems},
  booktitle = {Proceedings of 6th International Symposium on Image and Signal Processing and Analysis (ISPA'09)},
  year      = {2009},
  keywords  = {IM2.MPR, Report_VIII},
}

@inproceedings{RichKryszDryg1,
  author    = {Richiardi, J. and Kryszczuk, K. and Drygajlo, A.},
  title     = {Static models of derivative-coordinates phase spaces for multivariate time series classification: an application to signature verification},
  booktitle = {Advances in Biometrics, Lecture Notes in Computer Science 5558},
  year      = {2009},
  pages     = {1200--1208},
  keywords  = {IM2.MPR, Report_VIII},
}

@inproceedings{Kludas2008:gfkla,
  author    = {Kludas, J. and Bruno, E. and Marchand-Maillet, S.},
  title     = {Exploiting synergistic and redundant features for multimedia document classification},
  booktitle = {32nd Annual Conference of the German Classification Society - Advances in Data Analysis, Data Handling and Business Intelligence (GfKl 2008)},
  year      = {2008},
  keywords  = {Report_VII, IM2.MCA},
}

@phdthesis{Mesot_THESIS_2008,
  author   = {Mesot, B.},
  title    = {Inference in switching linear dynamical systems applied to noise robust speech recognition of isolated digits},
  crossref = {mesot:rr08-35},
  year     = {2008},
  month    = may,
  school   = {Ecole Polytechnique F\'ed\'erale de Lausanne},
  note     = {Th{\`e}se Ecole polytechnique f\'ed\'erale de Lausanne EPFL, no 4059 (2008), Facult\'e des sciences et techniques de l'ing\'enieur STI, Section de g\'enie \'electrique et \'electronique, Institut de g\'enie \'electrique et \'electronique IEL (Laboratoire de l'IDIAP LIDIAP). Dir.: Herv\'e Bourlard},
  keywords = {Report_VII,IM2.AP},
  abstract = {Real world applications such as hands-free dialling in cars may have to perform recognition of spoken digits in potentially very noisy environments. Existing state-of-the-art solutions to this problem use feature-based Hidden Markov Models (HMMs), with a preprocessing stage to clean the noisy signal. However, the effect that the noise has on the induced HMM features is difficult to model exactly and limits the performance of the HMM system. An alternative to feature-based HMMs is to model the clean speech waveform directly, which has the potential advantage that including an explicit model of additive noise is straightforward. One of the most simple model of the clean speech waveform is the autoregressive (AR) process. Being too simple to cope with the nonlinearity of the speech signal, the AR process is generally embedded into a more elaborate model, such as the Switching Autoregressive HMM (SAR-HMM). In this thesis, we extend the SAR-HMM to jointly model the clean speech waveform and additive Gaussian white noise. This is achieved by using a Switching Linear Dynamical System (SLDS) whose internal dynamics is autoregressive. On an isolated digit recognition task where utterances have been corrupted by additive Gaussian white noise, the proposed SLDS outperforms a state-of-the-art HMM system. For more natural noise sources, at low signal to noise ratios (SNRs), it is also significantly more accurate than a feature-based HMM system. Inferring the clean waveform from the observed noisy signal with a SLDS is formally intractable, resulting in many approximation strategies in the literature. In this thesis, we present the Expectation Correction (EC) approximation. The algorithm has excellent numerical performance compared to a wide range of competing techniques, and provides a stable and accurate linear-time approximation which scales well to long time series such as those found in acoustic modelling. 
A fundamental issue faced by models based on AR processes is that they are sensitive to variations in the amplitude of the signal. One way to overcome this limitation is to use Gain Adaptation (GA) to adjust the amplitude by maximising the likelihood of the observed signal. However, adjusting model parameters without constraint may lead to overfitting when the models are sufficiently flexible. In this thesis, we propose a statistically principled alternative based on an exact Bayesian procedure in which priors are explicitly defined on the parameters of the underlying AR process. Compared to GA, the Bayesian approach enhances recognition accuracy at high SNRs, but is slightly less accurate at low SNRs.},
  projects = {Idiap},
}

@inproceedings{Garau_ACMMULTIMEDIA_2009,
  author    = {Garau, G. and Ba, S. and Bourlard, H. and Odobez, J. -M.},
  title     = {Investigating the use of Visual Focus of Attention for Audio-Visual Speaker Diarisation},
  booktitle = {Proceedings of the ACM International Conference on Multimedia},
  year      = {2009},
  location  = {Beijing, China},
  keywords  = {IM2.MPR, Report_VIII},
  abstract  = {Audio-visual speaker diarisation is the task of estimating ``who spoke when'' using audio and visual cues. In this paper we propose the combination of an audio diarisation system with psychology inspired visual features, reporting experiments on multiparty meetings, a challenging domain characterised by unconstrained interaction and participant movements. More precisely the role of gaze in coordinating speaker turns was exploited by the use of Visual Focus of Attention features. Experiments were performed both with the reference and 3 automatic VFoA estimation systems, based on head pose and visual activity cues, of increasing complexity. VFoA features yielded consistent speaker diarisation improvements in combination with audio features using a multi-stream approach.},
  projects  = {Idiap, AMIDA, IM2},
}

@article{eth_biwi_00461,
  author   = {Bray, M. and Koller-Meier, E. and van Gool, L.},
  title    = {Smart particle filtering for high-dimensional tracking},
  journal  = {Computer Vision and Image Understanding},
  year     = {2007},
  keywords = {Report_VI, IM2.VP, stochastic meta-descent, importance sampling, smart particle filter, hand tracking},
}

@article{eth_biwi_00545,
  author   = {Schindler, K. and van Gool, L. and de Gelder, B.},
  title    = {Recognizing emotions expressed by body pose: a biologically inspired neural model},
  journal  = {Neural Networks},
  year     = {2008},
  keywords = {Report_VII, IM2.VP},
}

@inproceedings{eth_biwi_00399,
  author    = {M{\"u}ller, P. and Wonka, P. and Haegler, S. and Ulmer, A. and van Gool, L.},
  title     = {Procedural modeling of buildings},
  booktitle = {Proceedings of ACM SIGGRAPH 2006 / ACM Transactions on Graphics},
  year      = {2006},
  volume    = {25},
  pages     = {614--623},
  publisher = {ACM Press},
  keywords  = {Report_VI, IM2.VP, Procedural Modeling, Architecture, Chomsky Grammars, L-systems, Computer-Aided Design},
}

@inproceedings{eth_biwi_00468,
  author    = {Wey, P. and Fischer, B. and Bay, H. and Buhmann, J. M.},
  title     = {Dense stereo by triangular meshing and cross validation},
  booktitle = {DAGM-Symposium},
  year      = {2006},
  pages     = {708--717},
  keywords  = {Report_VI, IM2.VP},
}

@inproceedings{eth_biwi_00541,
  author    = {Weise, T. and Leibe, B. and van Gool, L.},
  title     = {Accurate and robust registration for in-hand modeling},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR'08)},
  year      = {2008},
  keywords  = {Report_VII, IM2.VP},
}

@article{eth_biwi_00540,
  author   = {Cornelis, N. and Leibe, B. and Cornelis, K. and van Gool, L.},
  title    = {3d urban scene modeling integrating recognition and reconstruction},
  journal  = {International Journal of Computer Vision},
  year     = {2008},
  volume   = {78},
  number   = {2-3},
  pages    = {121--141},
  keywords = {Report_VII, IM2.VP},
}

@inproceedings{eth_biwi_00543,
  author    = {Ess, A. and Leibe, B. and Schindler, K. and van Gool, L.},
  title     = {A mobile vision system for robust multi-person tracking},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR'08)},
  year      = {2008},
  keywords  = {Report_VII, IM2.VP},
}

@inproceedings{eth_biwi_00542,
  author    = {Torii, A. and Havlena, M. and Pajdla, T. and Leibe, B.},
  title     = {Measuring camera translation by the dominant apical angle},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR'08)},
  year      = {2008},
  keywords  = {Report_VII, IM2.VP},
}

@article{Baker:ISPM:2009a,
  author   = {Baker, J. and Deng, L. and Glass, J. and Khudanpur, S. and Lee, C. -H. and Morgan, N. and O'Shaughnessy, D.},
  title    = {Research developments and directions in speech recognition and understanding},
  journal  = {IEEE Signal Processing Magazine},
  year     = {2009},
  volume   = {26},
  number   = {3},
  pages    = {75--80},
  keywords = {IM2.AP, Report_VIII},
}

@inproceedings{Pronobis_ICRA2008_2008,
  author    = {Pronobis, A. and Martinez Mozos, O. and Caputo, B.},
  title     = {SVM-based Discriminative Accumulation Scheme for Place Recognition},
  booktitle = {Proceedings of the IEEE International Conference on Robotics and Automation (ICRA08)},
  year      = {2008},
  keywords  = {IM2.MPR, Report_VIII},
  abstract  = {Integrating information coming from different sensors is a fundamental capability for autonomous robots. For complex tasks like topological localization, it would be desirable to use multiple cues, possibly from different modalities, so to achieve robust performance. This paper proposes a new method for integrating multiple cues. For each cue we train a large margin classifier which outputs a set of scores indicating the confidence of the decision. These scores are then used as input to a Support Vector Machine, that learns how to weight each cue, for each class, optimally during training. We call this algorithm SVM-based Discriminative Accumulation Scheme (SVM-DAS). We applied our method to the topological localization task, using vision and laser-based cues. Experimental results clearly show the value of our approach.},
  projects  = {Idiap, DIRAC},
}

@inproceedings{gillick08,
  author    = {Gillick, D. and Hakkani-Tur, D. and Levit, M.},
  title     = {Unsupervised learning of edit parameters for matching name variants},
  booktitle = {Proceedings of Interspeech 2008},
  year      = {2008},
  location  = {Brisbane, Australia},
  note      = {to appear},
  keywords  = {Report_VII, IM2.AP},
}

@inproceedings{Ganapathy_TSD2009-2_2009,
  author    = {Ganapathy, S. and Motlicek, P. and Hermansky, H.},
  title     = {Error Resilient Speech Coding Using Sub-band Hilbert Envelopes},
  crossref  = {Ganapathy_TSD2009_2009},
  booktitle = {12th International Conference on Text, Speech and Dialogue, TSD 2009},
  series    = {LNAI 5729},
  year      = {2009},
  pages     = {355--362},
  publisher = {Springer - Verlag, Berlin Heidelberg 2009},
  location  = {Pilsen, Czech Republic},
  isbn      = {3-642-04207-4},
  issn      = {0302-9743},
  keywords  = {IM2.VP, Report_VIII},
  abstract  = {Frequency Domain Linear Prediction (FDLP) represents a technique for auto-regressive modelling of Hilbert envelopes of a signal. In this paper, we propose a speech coding technique that uses FDLP in Quadrature Mirror Filter (QMF) sub-bands of short segments of the speech signal (25 ms). Line Spectral Frequency parameters related to au-toregressive models and the spectral components of the residual signals are transmitted. For simulating the effects of lossy transmission channels, bit-packets are dropped randomly. In the objective and subjective quality evaluations, the proposed FDLP speech codec is judged to be more resilient to bit-packet losses compared to the state-of-the-art Adaptive Multi-Rate Wide-Band (AMR-WB) codec at 12 kbps.},
  projects  = {Idiap},
}

@misc{Soleymani:MLMI08,
  author       = {Soleymani, M. and Chanel, G. and Kierkels, J. and Pun, T.},
  title        = {Valence-arousal representation of movie scenes based on multimedia content analysis and user's physiological emotional responses},
  year         = {2008},
  howpublished = {5th Joint Workshop on Machine Learning and Multimodal Interaction},
  keywords     = {IM2.MCA, Report_VIII},
  owner        = {Soleymani},
  vgclass      = {refpap},
  vgproject    = {bmi},
}

@inproceedings{Garner_ASRU_2009,
  author    = {Garner, P. N.},
  title     = {SNR Features for Automatic Speech Recognition},
  crossref  = {Garner_Idiap-RR-25-2009},
  booktitle = {Proceedings of the IEEE workshop on Automatic Speech Recognition and Understanding},
  year      = {2009},
  location  = {Merano, Italy},
  keywords  = {IM2.AP, Report_VIII},
  abstract  = {When combined with cepstral normalisation techniques, the features normally used in Automatic Speech Recognition are based on Signal to Noise Ratio (SNR). We show that calculating SNR from the outset, rather than relying on cepstral normalisation to produce it, gives features with a number of practical and mathematical advantages over power-spectral based ones. In a detailed analysis, we derive Maximum Likelihood and Maximum a-Posteriori estimates for SNR based features, and show that they can outperform more conventional ones, especially when subsequently combined with cepstral variance normalisation. We further show anecdotal evidence that SNR based features lend themselves well to noise estimates based on low-energy envelope tracking.},
  projects  = {IM2},
}

@inproceedings{eth_biwi_00499,
  author    = {Quack, T. and Ferrari, V. and Leibe, B. and van Gool, L.},
  title     = {Efficient mining of frequent and distinctive feature configurations},
  booktitle = {International Conference on Computer Vision (ICCV'07)},
  year      = {2007},
  keywords  = {Report_VII, IM2.MCA},
}

@inproceedings{eth_biwi_00498,
  author    = {Ess, A. and Leibe, B. and van Gool, L.},
  title     = {Depth and appearance for mobile scene analysis},
  booktitle = {International Conference on Computer Vision (ICCV'07)},
  year      = {2007},
  keywords  = {Report_VII, IM2.VP},
}

@inproceedings{eth_biwi_00497,
  author    = {Leibe, B. and Schindler, K. and van Gool, L.},
  title     = {Coupled detection and trajectory estimation for multi-object tracking},
  booktitle = {International Conference on Computer Vision (ICCV'07)},
  year      = {2007},
  keywords  = {Report_VII, IM2.VP},
}

@inproceedings{eth_biwi_00496,
  author    = {Ozden, K. E. and Schindler, K. and van Gool, L.},
  title     = {Simultaneous segmentation and 3d reconstruction of monocular image sequences},
  booktitle = {International Conference on Computer Vision (ICCV'07)},
  year      = {2007},
  keywords  = {Report_VII, IM2.VP},
}

@inproceedings{eth_biwi_00492,
  author    = {Ess, A. and Neubeck, A. and van Gool, L.},
  title     = {Generalised linear pose estimation},
  booktitle = {BMVC},
  year      = {2007},
  note      = {in press},
  keywords  = {Report_VI, IM2.VP},
}

@incollection{eth_biwi_00490,
  author    = {Everingham, M. and Zisserman, A. and Williams, C. and van Gool, L. and Allan, M. and Bishop, C. and Chapelle, O. and Dalal, N. and Deselaers, T. and Dorko, G. and Duffner, S. and Eichhorn, J. and Farquhar, J. and Fritz, M. and Garcia, C. and Griffiths, T. and Jurie, F. and Keysers, D. and Koskela, M. and Laaksonen, J. and Larlus, D. and Leibe, B. and Meng, H. and Ney, H. and Schiele, B. and Schmid, C. and Seemann, E. and Shawe-Taylor, J. and Storkey, A. and Szedmak, S. and Triggs, B. and Ulusoy, I. and Viitaniemi, V. and Zhang, J.},
  title     = {The 2005 pascal visual object class challenge},
  booktitle = {Selected Proceedings of the 1st PASCAL Challenges Workshop, Lecture Notes in AI},
  year      = {2006},
  publisher = {Springer},
  keywords  = {Report_VI, IM2.VP},
}

@inproceedings{KryszDryg4,
  author    = {Kryszczuk, K. and Drygajlo, A.},
  title     = {What do quality measures predict in biometrics},
  booktitle = {16th European Signal Processing Conference},
  year      = {2008},
  keywords  = {IM2.MPR, Report_VIII},
}

@inproceedings{AMR2007Kludasa,
  author    = {Morrison, D. and Marchand-Maillet, S. and Bruno, E.},
  title     = {Automatic image annotation with relevance feedback and latent semantic analysis},
  booktitle = {Workshop on Adaptive Multimedia Retrieval (AMR 2007)},
  year      = {2007},
  keywords  = {Report_VI, IM2.MCA},
}

@article{KryszDryg1,
  author   = {Kryszczuk, K. and Drygajlo, A.},
  title    = {Improving biometric verification with class-independent quality information},
  journal  = {IET Signal Processing, Special Issue on Biometric Recognition},
  year     = {2009},
  volume   = {3},
  number   = {4},
  pages    = {310--321},
  keywords = {IM2.MPR, Report_VIII},
}

@inproceedings{KryszDryg3,
  author    = {Kryszczuk, K. and Drygajlo, A.},
  title     = {On quality of quality measures for classification},
  booktitle = {Biometrics and Identity Management, Lecture Notes in Computer Science 5372},
  year      = {2008},
  pages     = {19--28},
  keywords  = {IM2.MPR, Report_VIII},
}

@inproceedings{hillard06,
  author    = {Hillard, D. and Huang, Z. and Ji, H. and Grishman, R. and Hakkani-Tur, D. and Harper, M. and Ostendorf, M. and Wang, W.},
  title     = {Impact of Automatic Comma Prediction on POS/Name Tagging of Speech},
  booktitle = {Proc. IEEE/ACL Workshop on Spoken Language Technology},
  year      = {2006},
  keywords  = {Report_VI, IM2.AP},
}

@article{[Ortega09c],
  author   = {Ortega-Garcia, J. and Fierrez, J. and Alonso-Fernandez, F. and Galbally, J. and Freire, M. R. and Gonzalez-Rodriguez, J. and Garcia-Mateo, C. and Alba-Castro, J. -L. and Gonzalez-Agulla, E. and Otero-Muras, E. and Garcia-Salicetti, S. and Allano, L. and Ly-Van, B. and Dorizzi, B. and Kittler, J. and Bourlai, T. and Poh, N. and Deravi, F. and Ng, M. W. R. and Fairhurst, M. and Hennebert, J. and Humm, A. and Tistarelli, M. and Brodo, L. and Richiardi, J. and Drygajlo, A. and Ganster, H. and Sukno, F. M. and Pavani, S. -K. and Frangi, A. and Akarun, L. and Savran, A.},
  title    = {The multi-scenario multi-environment biosecure multimodal database (bmdb)},
  journal  = {IEEE Trans. on Pattern Analysis and Machine Intelligence},
  year     = {2009},
  note     = {to appear},
  keywords = {IM2.MPR, Report_VII},
}

@inproceedings{Pallotta2007,
  author       = {Pallotta, V. and Seretan, V. and Ailomaa, M.},
  title        = {User requirement analysis for meeting information retrieval based on query elicitation},
  booktitle    = {Proceedings of the 45th Annual Meeting of the Association for Computational Linguistics (ACL 2007)},
  year         = {2007},
  pages        = {1008--1015},
  organization = {Association for Computational Linguistics},
  url          = {http://liawww.epfl.ch/Publications/Archive/Pallotta2007.pdf},
  keywords     = {Report_VI, IM2.HMI},
}

@techreport{marcel:rr07-07,
  author      = {Marcel, S. and Abbet, P. and Guillemot, M.},
  title       = {Google Portrait},
  year        = {2007},
  type        = {Idiap-Com},
  number      = {Idiap-Com-07-2007},
  institution = {IDIAP},
  keywords    = {IM2.VP, Report_VI},
  internal-note = {title was missing; reconstructed from the system name in the abstract -- verify against Idiap-Com-07-2007},
  abstract    = {This paper presents a system to retrieve and browse images from the Internet containing only one particular object of interest: the human face. This system, called Google Portrait, uses Google Image search engine to retrieve images matching a text query and filters images containing faces using a face detector. Results and ranked by portraits and a tagging module is provided to change manually the label attached to faces.},
}

@inproceedings{Hung_hb08,
  author    = {Hung, H. and Huang, Y. and Yeo, C. and Gatica-Perez, D.},
  title     = {Associating audio-visual activity cues in a dominance estimation framework},
  booktitle = {CVPR Workshop on Human Communicative Behavior},
  year      = {2008},
  keywords  = {Report_VII, IM2.MPR},
}

@inproceedings{Soleymani:ISM08,
  author    = {Soleymani, M. and Chanel, G. and Kierkels, J. and Pun, T.},
  title     = {Affective characterization of movie scenes based on multimedia content analysis and user's physiological emotional responses},
  booktitle = {IEEE International Symposium on Multimedia},
  year      = {2008},
  keywords  = {IM2.MCA, Report_VIII},
  owner     = {Soleymani},
  vgclass   = {refpap},
  vgproject = {bmi},
}

@phdthesis{rodrig-thesis-2006,
  author   = {Rodriguez, Y.},
  title    = {Face detection and verification using local binary patterns},
  year     = {2006},
  type     = {IDIAP-RR},
  school   = {\'Ecole Polytechnique F\'ed\'erale de Lausanne},
  note     = {PhD Thesis \#3681 at the \'Ecole Polytechnique F\'ed\'erale de Lausanne},
  keywords = {Report_VI, IM2.VP, Face Detection and Verification, Boosting, Local Binary Patterns},
  abstract = {This thesis proposes a robust Automatic Face Verification (AFV) system using Local Binary Patterns (LBP). AFV is mainly composed of two modules: Face Detection (FD) and Face Verification (FV). The purpose of FD is to determine whether there are any face in an image, while FV involves confirming or denying the identity claimed by a person. The contributions of this thesis are the following: 1) a real-time multiview FD system which is robust to illumination and partial occlusion, 2) a FV system based on the adaptation of LBP features, 3) an extensive study of the performance evaluation of FD algorithms and in particular the effect of FD errors on FV performance. The first part of the thesis addresses the problem of frontal FD. We introduce the system of Viola and Jones which is the first real-time frontal face detector. One of its limitations is the sensitivity to local lighting variations and partial occlusion of the face. In order to cope with these limitations, we propose to use LBP features. Special emphasis is given to the scanning process and to the merging of overlapped detections, because both have a significant impact on the performance. We then extend our frontal FD module to multiview FD. In the second part, we present a novel generative approach for FV, based on an LBP description of the face. The main advantages compared to previous approaches are a very fast and simple training procedure and robustness to bad lighting conditions. In the third part, we address the problem of estimating the quality of FD. We first show the influence of FD errors on the FV task and then empirically demonstrate the limitations of current detection measures when applied to this task. In order to properly evaluate the performance of a face detection module, we propose to embed the FV into the performance measuring process. We show empirically that the proposed methodology better matches the final FV performance.},
  ipdmembership = {vision},
  pdf        = {ftp://ftp.idiap.ch/pub/reports/2006/rodrig-idiap-rr-06-79.pdf},
  postscript = {ftp://ftp.idiap.ch/pub/reports/2006/rodrig-idiap-rr-06-79.ps.gz}
}

@article{Ganapathy_JASA-EL_2008,
  author   = {Ganapathy, S. and Thomas, S. and Hermansky, H.},
  title    = {Modulation Frequency Features For Phoneme Recognition In Noisy Speech},
  crossref = {Ganapathy_Idiap-RR-70-2008},
  journal  = {Journal of Acoustical Society of America - Express Letters},
  year     = {2008},
  keywords = {IM2.AP, Report_VIII},
  abstract = {In this letter, a new feature extraction technique based on modulation spectrum derived from syllable-length segments of sub-band temporal envelopes is proposed. These sub-band envelopes are derived from auto-regressive modelling of Hilbert envelopes of the signal in critical bands, processed by both a static (logarithmic) and a dynamic (adaptive loops) compression. These features are then used for machine recognition of phonemes in telephone speech. Without degrading the performance in clean conditions, the proposed features show significant improvements compared to other state-of-the-art speech analysis techniques. In addition to the overall phoneme recognition rates, the performance with broad phonetic classes is reported.},
  projects = {Idiap, IM2, AMIDA},
}

@book{schlapbach08awriter,
  author    = {Schlapbach, A.},
  title     = {Writer identification and verification},
  series    = {Dissertations in Artificial Intelligence},
  year      = {2008},
  volume    = {311},
  publisher = {IOS Press},
  isbn      = {978-1-58603-825-0},
  keywords  = {IM2.VP, Report_VIII},
}

@incollection{bunke07graphMatching,
  author    = {Bunke, H. and Neuhaus, M.},
  title     = {Graph matching -- exact and error-tolerant methods and the automatic learning of edit costs},
  editor    = {Cook, D. J. and Holder, L. B.},
  booktitle = {Mining Graph Data},
  year      = {2007},
  pages     = {17--34},
  publisher = {Wiley},
  keywords  = {Report_VI, IM2.ACP},
}

@inproceedings{Kryszczuk2007QstackQuality,
  author    = {Kryszczuk, K. and Drygajlo, A.},
  title     = {Q-stack: uni- and multimodal classifier stacking with quality measures},
  booktitle = {Proc. 7th Int. Workshop on Multiple Classifier Systems},
  year      = {2007},
  publisher = {Springer},
  keywords  = {Report_VI, IM2.MPR},
  owner     = {Andrzej},
}

@techreport{marcel:com06-06,
  author      = {Marcel, S. and Rodriguez, Y. and Guillemot, M. and Popescu-Belis, A.},
  title       = {Annotation of face detection: description of xml format and files},
  year        = {2006},
  type        = {IDIAP-COM},
  number      = {06},
  institution = {IDIAP},
  keywords    = {Report_VI, IM2.VP},
  ipdmembership = {vision},
  pdf         = {ftp://ftp.idiap.ch/pub/reports/2006/marcel-idiap-com-06-06.pdf},
  postscript  = {ftp://ftp.idiap.ch/pub/reports/2006/marcel-idiap-com-06-06.ps.gz},
}

@inproceedings{Parthasarathi_ICMI-MLMI2009_2009,
  author    = {Parthasarathi, S. H. K. and Magimai-Doss, M. and Gatica-Perez, D. and Bourlard, H.},
  title     = {Speaker Change Detection with Privacy-Preserving Audio Cues},
  crossref  = {Parthasarathi_Idiap-RR-23-2009},
  booktitle = {Proceedings of ICMI-MLMI 2009},
  year      = {2009},
  keywords  = {IM2.AP, Report_VIII},
  abstract  = {In this paper we investigate a set of privacy-sensitive audio features for speaker change detection (SCD) in multiparty conversations. These features are based on three different principles: characterizing the excitation source information using linear prediction residual, characterizing subband spectral information shown to contain speaker information, and characterizing the general shape of the spectrum. Experiments show that the performance of the privacy-sensitive features is comparable or better than that of the state-of-the-art full-band spectral-based features, namely, mel frequency cepstral coefficients, which suggests that socially acceptable ways of recording conversations in real-life is feasible.},
  projects  = {Idiap, IM2, SNSF-MULTI},
}

@inproceedings{Soleymani:WMS08,
  author    = {Soleymani, M. and Chanel, G. and Kierkels, J. and Pun, T.},
  title     = {Affective ranking of movie scenes using physiological signals and content analysis},
  booktitle = {2nd ACM Workshop on the Many Faces of Multimedia Semantics, ACM MM08},
  year      = {2008},
  keywords  = {IM2.MCA, Report_VIII},
  owner     = {Soleymani},
  vgclass   = {refpap},
  vgproject = {bmi},
}

@article{Rakotomamonjy_JMLR_2008,
  author   = {Rakotomamonjy, A. and Bach, F. and Canu, S. and Grandvalet, Y.},
  title    = {SimpleMKL},
  journal  = {Journal of Machine Learning Research},
  year     = {2008},
  volume   = {9},
  pages    = {2491--2521},
  keywords = {IM2.MPR, Report_VIII},
  abstract = {Multiple kernel learning (MKL) aims at simultaneously learning a kernel and the associated predictor in supervised learning settings. For the support vector machine, an efficient and general multiple kernel learning algorithm, based on semi-infinite linear programming, has been recently proposed. This approach has opened new perspectives since it makes MKL tractable for large-scale problems, by iteratively using existing support vector machine code. However, it turns out that this iterative algorithm needs numerous iterations for converging towards a reasonable solution. In this paper, we address the MKL problem through a weighted 2-norm regularization formulation with an additional constraint on the weights that encourages sparse kernel combinations. Apart from learning the combination, we solve a standard SVM optimization problem, where the kernel is defined as a linear combination of multiple kernels. We propose an algorithm, named SimpleMKL, for solving this MKL problem and provide a new insight on MKL algorithms based on mixed-norm regularization by showing that the two approaches are equivalent. We show how SimpleMKL can be applied beyond binary classification, for problems like regression, clustering (one-class classification) or multiclass classification. Experimental results show that the proposed algorithm converges rapidly and that its efficiency compares favorably to other MKL algorithms. Finally, we illustrate the usefulness of MKL for some regressors based on wavelet kernels and on some model selection problems related to multiclass classification problems.},
  projects = {Idiap},
}

@inproceedings{voloshynovskiy2,
  author    = {Koval, O. and Voloshynovskiy, S. and Caire, F. and Bas, P.},
  title     = {On security threats for robust perceptual hashing},
  booktitle = {Electronic Imaging 2009},
  year      = {2009},
  keywords  = {IM2.MPR, Report_VIII},
}

@inproceedings{kludas2008:ipta,
  author    = {Kludas, J. and Marchand-Maillet, S. and Bruno, E.},
  title     = {Exploiting document feature interactions for efficient information fusion in high dimensional spaces},
  booktitle = {Proceedings of the First International Workshops on Image Processing Theory, Tools and Applications (IPTA'2008)},
  year      = {2008},
  note      = {(invited)},
  url       = {http://viper.unige.ch/documents/pdf/kludas2008-ipta.pdf},
  keywords  = {IM2.MCA, Report_VIII},
}

@article{eth_biwi_00539,
  author   = {Leibe, B. and Leonardis, A. and Schiele, B.},
  title    = {Robust object detection with interleaved categorization and segmentation},
  journal  = {International Journal of Computer Vision},
  year     = {2008},
  volume   = {77},
  number   = {1-3},
  pages    = {259--289},
  keywords = {Report_VII, IM2.VP},
}

@inproceedings{jayagopi:acmmm:2008,
  author    = {Jayagopi, D. and Hung, H. and Yeo, C. and Gatica-Perez, D.},
  title     = {Predicting the dominant clique in meetings through fusion of nonverbal cues},
  crossref  = {jayagopi:rr08-08},
  booktitle = {ACM MM 2008},
  year      = {2008},
  note      = {IDIAP-RR 08-08},
  keywords  = {IM2.MCA, Report_VII},
  abstract  = {This paper addresses the problem of automatically predicting the dominant clique (i.e., the set of K-dominant people) in face-to-face small group meetings recorded by multiple audio and video sensors. For this goal, we present a framework that integrates automatically extracted nonverbal cues and dominance prediction models. Easily computable audio and visual activity cues are automatically extracted from cameras and microphones. Such nonverbal cues, correlated to human display and perception of dominance, are well documented in the social psychology literature. The effectiveness of the cues were systematically investigated as single cues as well as in unimodal and multimodal combinations using unsupervised and supervised learning approaches for dominant clique estimation. Our framework was evaluated on a five-hour public corpus of teamwork meetings with third-party manual annotation of perceived dominance. Our best approaches can exactly predict the dominant clique with 80.8\% accuracy in four-person meetings in which multiple human annotators agree on their judgments of perceived dominance.},
}

@phdthesis{Grangier_THESIS_2008,
title = {Machine Learning for Information Retrieval},
author = {Grangier, D. },
crossref = {grangier:2008:rr_08-34},
year = {2008},
school = {\'Ecole Polytechnique F\'ed\'erale de Lausanne},
note = {Th\ese Ecole polytechnique f\'ed\'erale de Lausanne EPFL, no 4088 (2008), Facult\'e des sciences et techniques de l'ing\'enieur STI, Section de g\'enie \'electrique et \'electronique, Institut de g\'enie \'electrique et \'electronique IEL (Laboratoire de l'IDIAP LIDIAP). Dir.: Herv\'e Bourlard, Sami Bengio},
keywords = {discriminative learning, image retrieval, Information Retrieval, learning to rank, machine learning, online learning, spoken keyword spotting, text retrieval, IM2.AP,Report_VIII},
projects = {Idiap},
}

@article{Chanel:2009:IJHCS,
title = {Short-term emotion assessment in a recall paradigm},
author = {Chanel, G.  and Kierkels, J.  and Soleymani, M.  and Pun, T. },
journal = {International Journal of Human-Computer Studies},
year = {2009},
volume = {67},
number = {8},
pages = {607--627},
note = {DOI: http://dx.doi.org/10.1016/j.ijhcs.2009.03.005},
url = {http://dx.doi.org/10.1016/j.ijhcs.2009.03.005},
keywords = {IM2.MCA, Report_VIII},
owner = {Chanel},
vgclass = {refpap},
vgproject = {bmi}
}

@article{Pronobis_IJRR_2009,
title = {COLD: The COsy Localization Database},
author = {Pronobis, A.  and Caputo, B. },
journal = {International Journal of Robotics Research},
year = {2009},
volume = {28},
number = {5},
pages = {588--594},
keywords = {IM2.DMA, Report_VIII},
projects = {Idiap},
}

@inproceedings{cuendet06,
title = {Model Adaptation for Sentence Segmentation from Speech},
author = {Cuendet, S.  and Hakkani-Tur, D.  and Tur, G. },
booktitle = {Proc. IEEE/ACL Workshop on Spoken Language Technology},
year = {2006},
keywords = {Report_VI, IM2.AP}
}

@inproceedings{liwicki07combining,
title = {Combining on-line and off-line systems for handwriting recognition},
author = {Liwicki, M.  and Bunke, H. },
booktitle = {Proc. 9th Int. Conf. on Document Analysis and Recognition},
year = {2007},
pages = {372--376},
isbn = {978-0-7695-2822-9},
keywords = {Report_VII, IM2.VP},
peer = {yes}
}

@inproceedings{paiement:ICML:2008,
title = {A Distance Model for Rhythms},
author = {Paiement, J. -F.  and Grandvalet, Y.  and Bengio, S.  and Eck, D. },
crossref = {paiement:rr08-33},
booktitle = {25th International Conference on Machine Learning (ICML)},
year = {2008},
note = {IDIAP-RR 08-33},
keywords = {IM2.AP, Report_VIII},
abstract = {Modeling long-term dependencies in time series has proved very difficult to achieve with traditional machine learning methods. This problem occurs when considering music data. In this paper, we introduce a model for rhythms based on the distributions of distances between subsequences. A specific implementation of the model when considering Hamming distances over a simple rhythm representation is described. The proposed model consistently outperforms a standard Hidden Markov Model in terms of conditional prediction accuracy on two different music databases.},
ipdmembership = {learning},
projects = {Idiap},
pdf = {ftp://ftp.idiap.ch/pub/papers/2008/paiement-ICML-2008.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2008/paiement-ICML-2008.ps.gz}
}

@inproceedings{Vinciarelli_ICMI2008_2008,
title = {Social signals, their function, and automatic analysis: a survey},
author = {Vinciarelli, A.  and Pantic, M.  and Bourlard, H.  and Pentland, A. },
booktitle = {Proceedings of International Conference on Multimodal Interfaces (to appear)},
year = {2008},
keywords = {IM2.MCA, Report_VII},
abstract = {Social Signal Processing (SSP) aims at the analysis of social behaviour in both Human-Human and Human-Computer interactions. SSP revolves around automatic sensing and interpretation of social signals, complex aggregates of nonverbal behaviours through which individuals express their attitudes towards other human (and virtual) participants in the current social context. As such, SSP integrates both engineering (speech analysis, computer vision, etc.) and human sciences (social psychology, anthropology, etc.) as it requires multimodal and multidisciplinary approaches. As of today, SSP is still in its early infancy, but the domain is quickly developing, and a growing number of works is appearing in the literature. This paper provides an introduction to nonverbal behaviour involved in social signals and a survey of the main results obtained so far in SSP. It also outlines possibilities and challenges that SSP is expected to face in the next years if it is to reach its full maturity.}
}

@inproceedings{frinken09evaluating,
title = {Evaluating retraining rules for semi-supervised learning in neural network based cursive word recognition},
author = {Frinken, V.  and Bunke, H. },
booktitle = {Proc. 10th Int. Conf. on Document Analysis and Recognition},
year = {2009},
volume = {1},
pages = {31--35},
isbn = {978-0-7695-3725-2},
keywords = {IM2.VP, Report_VIII},
peer = {yes}
}

@inproceedings{pinto:asru-phnrecog:2007,
title = {Significance of contextual information in phoneme recognition},
author = {Pinto, J. P.  and R. M., P.  and Yegnanarayana, B.  and Hermansky, H. },
year = {2007},
note = {IDIAP-RR 07-28},
keywords = {Report_VI, IM2.AP},
ipdmembership = {speech},
ipdxref = {techreport:pinto-idiap-rr-07-28.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2007/pinto-asru-phnrecog-2007.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2007/pinto-asru-phnrecog-2007.ps.gz}
}

@inproceedings{liwicki07genderDetection,
title = {Automatic detection of gender and handedness from on-line handwriting},
author = {Liwicki, M.  and Schlapbach, A.  and Loretan, P.  and Bunke, H. },
booktitle = {Proc. 13th Conf. of the Graphonomics Society},
year = {2007},
pages = {179--183},
isbn = {978-0-7326-4003-3},
keywords = {Report_VII, IM2.VP},
peer = {yes}
}

@phdthesis{Aradilla_THESIS_2008,
title = {Acoustic models for posterior features in speech recognition},
author = {Aradilla, G. },
year = {2008},
school = {Ecole Polytechnique F\'ed\'erale de Lausanne},
note = {PhD Thesis no 4164},
keywords = {IM2.AP, Report_VII},
abstract = {In this thesis, we investigate the use of posterior probabilities of sub-word units directly as input features for automatic speech recognition (ASR). These posteriors, estimated from data-driven methods, display some favourable properties such as increased speaker invariance, but unlike conventional speech features also hold some peculiarities, such that their components are non-negative and sum up to one. State-of-the-art acoustic models for ASR rely on general-purpose similarity measures like Euclidean-based distances or likelihoods computed from Gaussian mixture models (GMMs), hence, they do not explicitly take into account the particular properties of posterior-based speech features. We explore here the use of the Kullback-Leibler (KL) divergence as similarity measure in both non-parametric methods using templates and parametric models that rely on an architecture based on hidden Markov models (HMMs). Traditionally, template matching (TM)-based ASR uses cepstral features and requires a large number of templates to capture the natural variability of spoken language. Thus, TM-based approaches are generally oriented to speaker-dependent and small vocabulary recognition tasks. In our work, we use posterior features to represent the templates and test utterances. Given the discriminative nature of posterior features, we show that a limited number of templates can accurately characterize a word. Experiments on different databases show that using KL divergence as local similarity measure yields significantly better performance than traditional TM-based approaches. The entropy of posterior features can also be used to further improve the results. In the context of HMMs, we propose a novel acoustic model where each state is parameterized by a reference multinomial distribution and the state score is based on the KL divergence between the reference distribution and the posterior features. 
Besides the fact that the KL divergence is a natural dissimilarity measure between posterior distributions, we further motivate the use of the KL divergence by showing that the proposed model can be interpreted in terms of maximum likelihood and information theoretic clustering. Furthermore, the KL-based acoustic model can be seen as a general case of other known acoustic models for posterior features such as hybrid HMM/MLP and discrete HMM. The presented approach has been extended to large vocabulary recognition tasks. When compared to state-of-the-art HMM/GMM, the KL-based acoustic model yields comparable results while using significantly fewer parameters.}
}

@inproceedings{voloshynovskiy2a,
title = {Binary robust hashing based on probabilistic bit reliability},
author = {Voloshynovskiy, S.  and Koval, O.  and Beekhof, F.  and Holotyak, T. },
booktitle = {IEEE Workshop on Statistical Signal Processing 2009},
year = {2009},
keywords = {IM2.MPR, Report_VIII}
}

@article{Evequoz20079,
title = {Indexing and visualizing digital memories through personal email archive},
author = {Ev\'equoz, F.  and Lalanne, D. },
year = {2007},
pages = {21--24},
keywords = {Report_VII, IM2.HMI}
}

@inproceedings{vinciarelli:icmevincia:2007,
title = {Semantic segmentation of radio programs using social network analysis and duration distribution modeling},
author = {Vinciarelli, A.  and Fern{\'a}ndez, F.  and Favre, S. },
booktitle = {IEEE International Conference on Multimedia and Expo (ICME)},
year = {2007},
note = {IDIAP-RR 06-75},
keywords = {Report_VI, IM2.AP.MPR, joint publication},
abstract = {This work presents and compare two approaches for the semantic segmentation of broadcast news: the first is based on Social Network Analysis, the second is based on Poisson Stochastic Processes. The experiments are performed over 27 hours of material: preliminary results are obtained by addressing the problem of splitting different episodes of the same program into two parts corresponding to a news bulletin and a talk-show respectively. The results show that the transition point between the two parts can be detected with an average error of around three minutes, i.e. roughly 5 percent of each episode duration.},
ipdmembership = {vision},
ipdxref = {techreport:vinciarelli-idiap-rr-06-75.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2007/vinciarelli-icmevincia-2007.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2007/vinciarelli-icmevincia-2007.ps.gz}
}

@inproceedings{Berclaz_ECCV_2008,
title = {Multi-camera tracking and atypical motion detection with behavioral maps},
author = {Berclaz, J.  and Fleuret, F.  and Fua, P. },
booktitle = {The 10th European Conference on Computer Vision (ECCV 2008)},
year = {2008},
location = {Marseille, France},
keywords = {IM2.MPR, Report_VII},
abstract = {We introduce a novel behavioral model to describe pedestrians motions, which is able to capture sophisticated motion patterns resulting from the mixture of different categories of random trajectories. Due to its simplicity, this model can be learned from video sequences in a totally unsupervised manner through an Expectation-Maximization procedure. When integrated into a complete multi-camera tracking system, it improves the tracking performance in ambiguous situations, compared to a standard ad-hoc isotropic Markovian motion model. Moreover, it can be used to compute a score which characterizes atypical individual motions. Experiments on outdoor video sequences demonstrate both the improvement of tracking performance when compared to a state-of-the-art tracking system and the reliability of the atypical motion detection.}
}

@inproceedings{vergyri08,
title = {Development of the {SRI}/{Nightingale} {Arabic} {ASR} system},
author = {Vergyri, D.  and Mandal, A.  and Wang, W.  and Stolcke, A.  and Zheng, J.  and Graciarena, M.  and Rybach, D.  and Gollan, C.  and Schlater, R.  and Kirchoff, K.  and Faria, A.  and Morgan, N. },
booktitle = {Proceedings of Interspeech 2008},
year = {2008},
location = {Brisbane, Australia},
note = {to appear},
keywords = {Report_VII, IM2.AP}
}

@book{DenisLalanne20093,
title = {Human machine interaction},
editor = {Lalanne, D.  and Kohlas, J. },
year = {2009},
publisher = {Springer},
series = {Lecture Notes in Computer Science},
keywords = {IM2.HMI, Report_VIII}
}

@inproceedings{Mueller:2007:PMF,
title = {Image-based procedural modeling of facades},
author = {M{\"u}ller, P.  and Zeng, G.  and Wonka, P.  and van Gool, L. },
booktitle = {Proceedings of ACM SIGGRAPH 2007 / ACM Transactions on Graphics},
year = {2007},
volume = {26},
number = {3},
publisher = {ACM Press},
address = {New York, NY, USA},
keywords = {Report_VI, IM2.VP}
}

@techreport{lperruchoud:com-08-02,
title = {The Anterior Cingulate Cortex},
author = {Perruchoud, L. },
year = {2008},
month = {April},
type = {Idiap-Com},
number = {Idiap-Com-02-2008},
institution = {IDIAP},
keywords = {IM2.MPR,Report_VIII},
ipdmembership = {learning},
projects = {Idiap},
pdf = {ftp://ftp.idiap.ch/pub/reports/2008/lperruchoud-idiap-com-08-02.pdf}
}

@inproceedings{Yao_ECCV-VS_2008,
title = {Fast human detection from videos using covariance features},
author = {Yao, J.  and Odobez, J. -M. },
crossref = {Yao_Idiap-RR-68-2007},
booktitle = {European Conference on Computer Vision, workshop on Visual Surveillance (ECCV-VS)},
year = {2008},
location = {Marseille},
keywords = {IM2.VP, Report_VIII},
abstract = {In this paper, we present a fast method to detect humans from videos captured in surveillance applications. It is based on a cascade of LogitBoost classifiers relying on features mapped from the Riemanian manifold of region covariance matrices computed from input image features. The method was extended in several ways. First, as the mapping process is slow for high dimensional feature space, we propose to select weak classifiers based on subsets of the complete image feature space. In addition, we propose to combine these sub-matrix covariance features with the means of the image features computed within the same subwindow, which are readily available from the covariance extraction process. Finally, in the context of video acquired with stationary cameras, we propose to fuse image features from the spatial and temporal domains in order to jointly learn the correlation between appearance and foreground information based on background subtraction. Our method evaluated on a large set of videos coming from several databases (CAVIAR, PETS, ...), and can process from 5 to 20 frames/sec (for a 384x288 video) while achieving similar or better performance than existing methods.},
projects = {Idiap,
CARETAKER},
}

@incollection{varga08perturbation,
title = {Perturbation models for generating synthetic training data in handwriting recognition},
author = {Varga, T.  and Bunke, H. },
editor = {Marinai, S.  and Fujisawa, H. },
booktitle = {Machine Learning in Document Analysis and Recognition},
year = {2008},
pages = {333--360},
publisher = {Springer},
keywords = {Report_VII, IM2.VP},
peer = {yes}
}

@article{Bogdanova2006_1523/LTS,
title = {Scale-space analysis and active contours for omnidirectional images},
author = {Bogdanova, I.  and Bresson, X.  and Thiran, J. -Ph.  and Vandergheynst, P. },
journal = {IEEE Transactions on Image Processing},
year = {2007},
volume = {16},
number = {7},
pages = {1888--1901},
doi = {na},
keywords = {Report_VII, IM2.VP, joint publication, active contour; catadioptric camera; computer vision; LTS2; LTS5; omnidirection vision; scale-space; segmentation}
}

@inproceedings{SAMT2007,
title = {Hierarchical long-term learning for automatic image annotation},
author = {Morrison, D.  and Marchand-Maillet, S.  and Bruno, E. },
booktitle = {International Conference on Semantics And digital Media Technologies (SAMT 2007)},
year = {2007},
keywords = {Report_VI, IM2.MCA}
}

@inproceedings{berclaz-et-al-2008,
title = {Multi-camera tracking and atypical motion detection with behavioral maps},
author = {Berclaz, J.  and Fleuret, F.  and Fua, P. },
booktitle = {Proceedings of the European Conference on Computer Vision (ECCV)},
year = {2008},
pages = {112--125},
keywords = {IM2.VP, Report_VIII}
}

@techreport{paiement:rr08-51,
title = {Predictive Models for Music},
author = {Paiement, J. -F.  and Grandvalet, Y.  and Bengio, S. },
year = {2008},
type = {Idiap-RR},
number = {Idiap-RR-51-2008},
institution = {IDIAP},
note = {Submitted for publication},
keywords = {IM2.AP, Report_VIII},
abstract = {Modeling long-term dependencies in time series has proved very difficult to achieve with traditional machine learning methods. This problem occurs when considering music data. In this paper, we introduce generative models for melodies. We decompose melodic modeling into two subtasks. We first propose a rhythm model based on the distributions of distances between subsequences. Then, we define a generative model for melodies given chords and rhythms based on modeling sequences of Narmour features. The rhythm model consistently outperforms a standard Hidden Markov Model in terms of conditional prediction accuracy on two different music databases. Using a similar evaluation procedure, the proposed melodic model consistently outperforms an Input/Output Hidden Markov Model. Furthermore, sampling these models given appropriate musical contexts generates realistic melodies.},
ipdmembership = {learning},
projects = {Idiap},
pdf = {ftp://ftp.idiap.ch/pub/reports/2008/paiement-idiap-rr-08-51.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2008/paiement-idiap-rr-08-51.ps.gz}
}

@techreport{paiement:rr08-50,
title = {Probabilistic Models for Melodic Prediction},
author = {Paiement, J. -F.  and Bengio, S.  and Eck, D. },
year = {2008},
type = {Idiap-RR},
number = {Idiap-RR-50-2008},
institution = {IDIAP},
note = {Submitted for publication},
keywords = {IM2.AP, Report_VIII},
abstract = {Chord progressions are the building blocks from which tonal music is constructed. The choice of a particular representation for chords has a strong impact on statistical modeling of the dependence between chord symbols and the actual sequences of notes in polyphonic music. Melodic prediction is used in this paper as a benchmark task to evaluate the quality of four chord representations using two probabilistic model architectures derived from Input/Output Hidden Markov Models (IOHMMs).},
ipdmembership = {learning},
projects = {Idiap},
pdf = {ftp://ftp.idiap.ch/pub/reports/2008/paiement-idiap-rr-08-50.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2008/paiement-idiap-rr-08-50.ps.gz}
}

@inproceedings{Vinciarelli_IEEEICME_2009,
title = {Implicit Human Centered Tagging},
author = {Vinciarelli, A.  and Suditu, N.  and Pantic, M. },
booktitle = {Proceedings of IEEE Conference on Multimedia and Expo},
year = {2009},
pages = {1428--1431},
keywords = {IM2.MCA, Report_VIII},
abstract = {This paper provides a general introduction to the concept of Implicit Human-Centered Tagging (IHCT) - the automatic extraction of tags from nonverbal behavioral feedback of media users. The main idea behind IHCT is that nonverbal behaviors displayed when interacting with multimedia data (e.g., facial expressions, head nods, etc.) provide information useful for improving the tag sets associated with the data. As such behaviors are displayed naturally and spontaneously, no effort is required from the users, and this is why the resulting tagging process is said to be ''implicit''. Tags obtained through IHCT are expected to be more robust than tags associated with the data explicitly, at least in terms of: generality (they make sense to everybody) and statistical reliability (all tags will be sufficiently represented). The paper discusses these issues in detail and provides an overview of pioneering efforts in the field.},
projects = {EMMA,
IM2,
SSPNet},
}

@techreport{pinto:rr07-32,
title = {Comparing different word lattice rescoring approaches towards keyword spotting},
author = {Pinto, J. P.  and Bourlard, H.  and Graves, A.  and Hermansky, H. },
year = {2007},
type = {Idiap-RR},
number = {Idiap-RR-32-2007},
institution = {IDIAP},
note = {Submitted for publication},
keywords = {IM2.AP, Report_VII},
abstract = {In this paper, we further investigate the large vocabulary continuous speech recognition approach to keyword spotting. Given a speech utterance, recognition is performed to obtain a word lattice. The posterior probability of keyword hypotheses in the lattice is computed and used to derive a confidence measure to accept/reject the keyword. We extend this framework and replace the acoustic likelihoods in the lattice obtained from a Gaussian mixture model (GMM) with likelihoods derived from a multilayered perceptron (MLP). We compare the two rescoring techniques on the conversational telephone speech database distributed by NIST for the spoken term detection evaluation. Experimental results show that GMM lattices still perform better than the rescored lattices for short and medium length keywords, but on longer keywords, the MLP rescored word lattices perform slightly better.}
}

@article{Besson2006_1508/LTS,
title = {Extraction of audio features specific to speech production for multimodal speaker detection},
author = {Besson, P.  and Popovici, V.  and Vesin, J. M.  and Thiran, J. -Ph.  and Kunt, M. },
journal = {IEEE Transactions on Multimedia},
year = {2007},
doi = {na},
keywords = {Report_VI, LTS1; LTS5; speaker detection; multimodal; feature extraction; besson p.; IM2.MPR},
details = {http://infoscience.epfl.ch/search.py?recid=91017},
oai-id = {oai:infoscience.epfl.ch:91017},
oai-set = {article},
status = {ACCEPTED},
unit = {LTS}
}

@incollection{Wooters2008,
title = {The ICSI RT07s speaker diarization system},
author = {Wooters, C.  and Huijbregts, M. },
booktitle = {Multimodal Technologies for Perception of Humans},
year = {2008},
series = {Lecture Notes in Computer Science},
publisher = {Springer},
keywords = {Report_VII, IM2.AP},
owner = {dines}
}

@inproceedings{2008-bologna-icad,
title = {Pairing colored socks and following a red serpentine with sounds of musical instruments},
author = {Bologna, G.  and Deville, B.  and Vinckenbosch, M.  and Pun, T. },
booktitle = {ICAD 08, International Conference on Auditory Displays, Paris, France, June 24--27},
year = {2008},
keywords = {IM2.MCA, Report_VIII}
}

@inproceedings{BrunoDumas20089,
title = {Demonstration : hephaistk, une bo\^{\i}te {\`a} outils pour le prototypage d'interfaces multimodales},
author = {Dumas, B.  and Lalanne, D.  and Ingold, R. },
booktitle = {Proceedings of 20e Conf\'erence sur l'Interaction Homme-Machine (IHM 08)},
year = {2008},
pages = {215--216},
keywords = {IM2.HMI, Report_VIII}
}

@inproceedings{liwicki08combining,
title = {Combining on-line and off-line blstm networks for handwritten text line recognition},
author = {Liwicki, M.  and Bunke, H. },
booktitle = {Proc. 11th Int. Conf. on Frontiers in Handwriting Recognition},
year = {2008},
pages = {31--36},
keywords = {IM2.VP, Report_VIII},
peer = {yes}
}

@inproceedings{vinciarelli:acmmmvincia:2007,
title = {Broadcast news story segmentation using social network analysis and hidden {Markov} models},
author = {Vinciarelli, A.  and Favre, S. },
booktitle = {ACM International Conference on Multimedia},
year = {2007},
pages = {261--264},
note = {IDIAP-RR 07-30},
keywords = {Report_VI, IM2.AP.MPR, joint publication},
abstract = {This paper presents an approach for the segmentation of broadcast news into stories. The main novelty of this work is that the segmentation process does not take into account the content of the news, i.e. what is said, but rather the structure of the social relationships between the persons that in the news are involved. The main rationale behind such an approach is that people interacting with each other are likely to talk about the same topics, thus social relationships are likely to be correlated to stories. The approach is based on Social Network Analysis (for the representation of social relationships) and Hidden Markov Models (for the mapping of social relationships into stories). The experiments are performed over 26 hours of radio news and the results show that a fully automatic process achieves a purity higher than 0.75.}
}

@inproceedings{Koval:2007:RPH-Security,
title = {Security analysis of robust perceptual hashing},
author = {Koval, O.  and Voloshynovskiy, S.  and Beekhof, F.  and Pun, T. },
editor = {Delp III, E. J.  and Wong, P. W.  and Dittmann, J.  and Memon, N. D. },
booktitle = {Steganography, and Watermarking of Multimedia Contents X},
series = {Proceedings of SPIE},
year = {2008},
volume = {6819},
address = {(SPIE, Bellingham, WA 2008) 681906},
keywords = {Report_VII, IM2.MPR}
}

@phdthesis{Rigamonti20081,
title = {A framework for structuring multimedia archives and for browsing efficiently through multimodal links},
author = {Rigamonti, M. },
year = {2008},
school = {University of Fribourg, Switzerland},
keywords = {Report_VII, IM2.HMI}
}

@techreport{Pronobis_Idiap-RR-73-2008,
title = {Integrating audio and vision for robust automatic gender recognition},
author = {Pronobis, M.  and Magimai-Doss, M. },
year = {2008},
type = {Idiap-RR},
number = {Idiap-RR-73-2008},
institution = {Idiap},
keywords = {IM2.MPR, Report_VIII},
abstract = {We propose a multi-modal Automatic Gender Recognition (AGR) system based on audio-visual cues and present its thorough evaluation in realistic scenarios. First, we analyze robustness of different audio and visual features under varying conditions and create two uni-modal AGR systems. Then, we build an integrated audio-visual system by fusing information from each modality at the classifier level. Our extensive studies on the BANCA corpus comprising datasets of varying complexity show that: (a) the audio-based system is more robust than the vision-based system; (b) integration of audio-visual cues yields a resilient system and improves performance in noisy conditions.},
projects = {Idiap,
AMIDA},
}

@inproceedings{Gillick02,
title = {A global optimization framework for meeting summarization},
author = {Gillick, D.  and Riedhammer, K.  and Favre, B.  and Hakkani-Tur, D. },
booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). Taipei, Taiwan},
year = {2009},
keywords = {IM2.AP, Report_VIII}
}

@inproceedings{tsamuel:interspeech-2:2008,
title = {Hilbert envelope based spectro-temporal features for phoneme recognition in telephone speech},
author = {Thomas, S.  and Ganapathy, S.  and Hermansky, H. },
crossref = {tsamuel:rr08-18},
booktitle = {Interspeech 2008},
year = {2008},
location = {Brisbane, Australia},
note = {IDIAP-RR 08-18},
keywords = {IM2.AP, Report_VII},
abstract = {In this paper, we present a spectro-temporal feature extraction technique using sub-band Hilbert envelopes of relatively long segments of speech signal. Hilbert envelopes of the sub-bands are estimated using Frequency Domain Linear Prediction (FDLP). Spectral features are derived by integrating the sub-band Hilbert envelopes in short-term frames and the temporal features are formed by converting the FDLP envelopes into modulation frequency components. These are then combined at the phoneme posterior level and are used as the input features for a phoneme recognition system. In order to improve the robustness of the proposed features to telephone speech, the sub-band temporal envelopes are gain normalized prior to feature extraction. Phoneme recognition experiments on telephone speech in the HTIMIT database show significant performance improvements for the proposed features when compared to other robust feature techniques (average relative reduction of $11\%$ in phoneme error rate).}
}

@inproceedings{eth_biwi_00500,
title = {Depth-from-recognition: inferring metadata by cognitive feedback},
author = {Thomas, A.  and Ferrari, V.  and Leibe, B.  and Tuytelaars, T.  and van Gool, L. },
booktitle = {ICCV'07 Workshop on 3D Representations for Recognition},
year = {2007},
keywords = {Report_VII, IM2.VP}
}

@inproceedings{eth_biwi_00503,
title = {Automatic pose estimation for range images on the gpu},
author = {Germann, M.  and Breitenstein, M. D.  and Park, I. K.  and Pfister, H. },
booktitle = {Sixth International Conference on 3-D Digital Imaging and Modeling (3DIM 2007)},
year = {2007},
pages = {81--90},
publisher = {IEEE Computer Society},
keywords = {Report_VII, IM2.VP}
}

@inproceedings{Richiardi2007ReliabilityBasedVotingSchemes,
title = {Reliability-based voting schemes using modality-independent features in multi-classifier biometric authentication},
author = {Richiardi, J.  and Drygajlo, A. },
booktitle = {Proc. 7th Int. Workshop on Multiple Classifier Systems},
year = {2007},
publisher = {Springer},
keywords = {Report_VI, IM2.MPR},
owner = {Jori},
pdf = {C:_skoolworks_papers_relFusion_MCS2007submitrichiardi_53.pdf}
}

@article{kumatani:rr07-74,
title = {Adaptive beamforming with a minimum mutual information criterion},
author = {Kumatani, K.  and Mayer, H.  and Gehrig, T.  and Stoimenov, E.  and McDonough, J.  and W{\"o}lfel, M. },
journal = {IEEE Transactions on Audio, Speech, and Language Processing},
year = {2007},
volume = {15},
number = {8},
pages = {2527--2541},
issn = {1558-7916},
doi = {10.1109/tasl.2007.907430},
keywords = {IM2.AP, Report_VII},
abstract = {In this work, we consider an acoustic beamforming application where two speakers are simultaneously active. We construct one subband-domain beamformer in emphgeneralized sidelobe canceller (GSC) configuration for each source. In contrast to normal practice, we then jointly optimize the emphactive weight vectors of both GSCs to obtain two output signals with emphminimum mutual information (MMI). Assuming that the subband snapshots are Gaussian-distributed, this MMI criterion reduces to the requirement that the emphcross-correlation coefficient of the subband outputs of the two GSCs vanishes. We also compare separation performance under the Gaussian assumption with that obtained from several super-Gaussian probability density functions (pdfs), namely, the Laplace, $K_0$, and $Gamma$ pdfs. Our proposed technique provides effective nulling of the undesired source, but without the signal cancellation problems seen in conventional beamforming. Moreover, our technique does not suffer from the source permutation and scaling ambiguities encountered in conventional blind source separation algorithms. We demonstrate the effectiveness of our proposed technique through a series of far-field automatic speech recognition experiments on data from the emphPASCAL Speech Separation Challenge (SSC). On the SSC development data, the simple delay-and-sum beamformer achieves a word error rate (WER) of 70.4\%. The MMI beamformer under a Gaussian assumption achieves a 55.2\% WER, which is further reduced to 52.0\% with a $K_0$ pdf, whereas the WER for data recorded with a close-talking microphone is 21.6\%.}
}

@inproceedings{kumatani:rr07-73,
title = {Minimum mutual information beamforming for simultaneous active speakers},
author = {Kumatani, K.  and Mayer, H.  and Gehrig, T.  and Stoimenov, E.  and McDonough, J.  and W{\"o}lfel, M. },
booktitle = {IEEE Workshop on Automatic Speech Recognition \& Understanding (ASRU)},
year = {2007},
type = {Idiap-RR},
number = {Idiap-RR-73-2007},
pages = {71--76},
institution = {IDIAP},
location = {Kyoto},
doi = {10.1109/asru.2007.4430086},
keywords = {IM2.AP, Report_VII},
abstract = {In this work, we consider an acoustic beamforming application where two speakers are simultaneously active. We construct one subband-domain beamformer in emphgeneralized sidelobe canceller (GSC) configuration for each source. In contrast to normal practice, we then jointly optimize the emphactive weight vectors of both GSCs to obtain two output signals with emphminimum mutual information (MMI). Assuming that the subband snapshots are Gaussian-distributed, this MMI criterion reduces to the requirement that the emphcross-correlation coefficient of the subband outputs of the two GSCs vanishes. We also compare separation performance under the Gaussian assumption with that obtained from several super-Gaussian probability density functions (pdfs), namely, the Laplace, $K_0$, and $Gamma$ pdfs. Our proposed technique provides effective nulling of the undesired source, but without the signal cancellation problems seen in conventional beamforming. Moreover, our technique does not suffer from the source permutation and scaling ambiguities encountered in conventional blind source separation algorithms. We demonstrate the effectiveness of our proposed technique through a series of far-field automatic speech recognition experiments on data from the emphPASCAL Speech Separation Challenge (SSC). On the SSC development data, the simple delay-and-sum beamformer achieves a word error rate (WER) of 70.4\%. The MMI beamformer under a Gaussian assumption achieves a 55.2\% WER, which is further reduced to 52.0\% with a $K_0$ pdf, whereas the WER for data recorded with a close-talking microphone is 21.6\%.}
}

@techreport{sba:rr06-42,
  title         = {Recognizing people's focus of attention from head poses: a study},
  author        = {Ba, S. and Odobez, J.-M.},
  year          = {2006},
  type          = {IDIAP-RR},
  number        = {42},
  institution   = {IDIAP},
  keywords      = {Report_VI, IM2.VP},
  abstract      = {This paper presents a study on the recognition of the visual focus of attention (VFOA) of meeting participants based on their head pose. Contrary to previous studies on the topic, in our set-up, the potential VFOA of a person is not restricted to the other meeting participants only, but includes environmental targets (including a table, a projection screen). This has two consequences. First, it increases the number of possible ambiguities in identifying the VFOA from the head pose. Secondly, in the scenario we present here, full knowledge of the head pointing direction is required to identify the VFOA. An incomplete representation of the head pointing direction (head pan only) will not suffice. In this paper, using a corpus of 8 meetings of 10 minutes average length, featuring 4 persons involved in discussing statements projected on a screen, we analyze the above issues by evaluating, through numerical performance measures, the recognition of the VFOA from head pose information obtained either using a magnetic sensor device (the ground truth) or a vision based tracking system (head pose estimates). The results clearly show that in such complex but realistic situations, it can be optimistic to believe that the recognition of the VFOA can solely be based on the head pose, as some previous studies had suggested.},
  ipdmembership = {vision},
  pdf           = {ftp://ftp.idiap.ch/pub/reports/2006/sba-idiap-rr-06-42.pdf},
  postscript    = {ftp://ftp.idiap.ch/pub/reports/2006/sba-idiap-rr-06-42.ps.gz}
}

@inproceedings{Berclaz_VISAPP_2008,
  title     = {Principled Detection-by-classification from Multiple Views},
  author    = {Berclaz, J. and Fleuret, F. and Fua, P.},
  booktitle = {Proceedings of the International Conference on Computer Vision Theory and Applications},
  year      = {2008},
  volume    = {2},
  pages     = {375--382},
  keywords  = {IM2.MPR, IM2.VP, Report_VIII},
  abstract  = {Machine-learning based classification techniques have been shown to be effective at detecting objects in com- plex scenes. However, the final results are often obtained from the alarms produced by the classifiers through a post-processing which typically relies on ad hoc heuristics. Spatially close alarms are assumed to be triggered by the same target and grouped together. Here we replace those heuristics by a principled Bayesian approach, which uses knowledge about both the classifier response model and the scene geometry to combine multiple classification answers. We demonstrate its effectiveness for multi-view pedestrian detection. We estimate the marginal probabilities of presence of people at any location in a scene, given the responses of classifiers evaluated in each view. Our approach naturally takes into account both the occlusions and the very low metric accuracy of the classifiers due to their invariance to translation and scale. Results show our method produces one order of magnitude fewer false positives than a method that is representative of typical state-of-the-art approaches. Moreover, the framework we propose is generic and could be applied to any detection-by-classification task.},
  projects  = {Idiap, IM2}
}

@inproceedings{Orabona_ICDL2009_2009,
  title     = {A theoretical framework for transfer of knowledge across modalities in artificial and cognitive systems},
  author    = {Orabona, F. and Caputo, B. and Fillbrandt, A. and Ohl, F.},
  booktitle = {International Conference on Development and Learning ({ICDL})},
  year      = {2009},
  keywords  = {IM2.MPR, Report_VIII},
  projects  = {Idiap, DIRAC}
}

@inproceedings{Friedland03,
  title     = {Visual speaker localization aided by acoustic models},
  author    = {Friedland, G. and Yeo, C. and Hung, H.},
  booktitle = {Proceedings of ACM Multimedia},
  address   = {Beijing, China},
  year      = {2009},
  note      = {Full paper},
  keywords  = {IM2.AP, Report_VIII}
}

@inproceedings{LTS-CONF-2008-131,
  author      = {Sorci, M.  and Antonini, G.  and Cerretani, B.  and Cruz Mota, J.  and Rubin, T.  and Bierlaire, M.  and Thiran, J. -Ph. },
  title       = {Modelling human perception of static facial expressions},
  booktitle   = {Face and Gesture Recognition 2008},
  year        = {2008},
  location    = {Amsterdam},
  url         = {http://www.fg2008.nl/},
  keywords    = {Report_VII, IM2.VP, LTS; LTS5; Facial Expressions modeling; discrete choice models},
  abstract    = {Data collected through a recent web-based survey show that the perception (i.e. labeling) of a human facial expression by a human observer is a subjective process, which results in a lack of a unique ground-truth, as intended in the standard classification framework. In this paper we propose the use of Discrete Choice Models(DCM) for human perception of static facial expressions. Random utility functions are defined in order to capture the attractiveness, perceived by the human observer for an expression class, when asked to assign a label to an actual expression image. The utilities represent a natural way for the modeler to formalize her prior knowledge on the process. Starting with a model based on Facial Action Coding Systems (FACS), we subsequently defines two other models by adding two new sets of explanatory variables. The model parameters are learned through maximum likelihood estimation and a cross-validation procedure is used for validation purposes.},
  affiliation = {EPFL},
  details     = {http://infoscience.epfl.ch/record/125056},
  oai-id      = {oai:infoscience.epfl.ch:125056},
  oai-set     = {conf},
  review      = {REVIEWED},
  status      = {ACCEPTED},
  unit        = {LTS}
}

@incollection{Friedland06,
  title     = {Speaker diarization and identification},
  author    = {Friedland, G. and van Leeuwen, D.},
  editor    = {Sheu, P. and others},
  booktitle = {Semantic Computing},
  year      = {2009},
  publisher = {IEEE Press/Wiley},
  keywords  = {IM2.AP, Report_VIII}
}

@inproceedings{livescu07a,
  title     = {Manual Transcription of Conversational Speech at the Articulatory Feature Level},
  author    = {Livescu, K. and Bezman, A. and Borges, N. and Yung, L. and Cetin, O. and Frankel, J. and King, S. and Magimai-Doss, M. and Chi, X. and Lavoie, L.},
  booktitle = {Proc. ICASSP},
  address   = {Honolulu},
  year      = {2007},
  keywords  = {Report_VI, IM2.AP}
}

@inproceedings{neuhaus07quadraticProgramming,
  author    = {Neuhaus, M.  and Bunke, H. },
  title     = {A quadratic programming approach to the graph edit distance problem},
  booktitle = {Graph-Based Representations in Pattern Recognition},
  editor    = {Escolano, F.  and Vento, M. },
  series    = {Lecture Notes in Computer Science},
  volume    = {4538},
  pages     = {92--102},
  publisher = {Springer},
  year      = {2007},
  keywords  = {Report_VI, IM2.ACP},
  peer      = {yes}
}

@inproceedings{Friedland08,
  title     = {Fusion of short-term and long-term features for improved speaker diarization},
  author    = {Friedland, G. and Vinyals, O. and Huang, Y. and Muller, C.},
  booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  address   = {Taipei, Taiwan},
  year      = {2009},
  pages     = {4077--4080},
  keywords  = {IM2.AP, Report_VIII}
}

@inproceedings{Friedland09,
  title     = {Multi-modal speaker diarization of real-world meetings using compressed-domain video features},
  author    = {Friedland, G. and Hung, H. and Yeo, C.},
  booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  address   = {Taipei, Taiwan},
  year      = {2009},
  pages     = {4069--4072},
  keywords  = {IM2.AP, Report_VIII}
}

@inproceedings{einsele07:isspa,
  title     = {Towards identification of very low resolution, anti-aliased characters},
  author    = {Einsele, F. and Hennebert, J. and Ingold, R.},
  booktitle = {IEEE International Symposium on Signal Processing and its Applications (ISSPA'07)},
  address   = {Sharjah, United Arab Emirates},
  year      = {2007},
  keywords  = {Report_VI, IM2.MPR}
}

@inproceedings{kolar07,
  title     = {Speaker Adaptation of Language Models for Automatic Dialog Act Segmentation of Meetings},
  author    = {Kolar, J. and Liu, Y. and Shriberg, E.},
  booktitle = {Proceedings of Interspeech},
  address   = {Antwerp},
  year      = {2007},
  note      = {To appear},
  keywords  = {Report_VI, IM2.AP}
}

@article{grangier:2008:tpami,
  author   = {Grangier, D.  and Bengio, S. },
  title    = {A discriminative kernel-based model to rank images from text queries},
  journal  = {IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)},
  year     = {2008},
  crossref = {grangier:2007:idiap-07-38},
  keywords = {IM2.MPR, Report_VII},
  abstract = {This paper introduces a discriminative model for the retrieval of images from text queries. Our approach formalizes the retrieval task as a ranking problem, and introduces a learning procedure optimizing a criterion related to the ranking performance. The proposed model hence addresses the retrieval problem directly and does not rely on an intermediate image annotation task, which contrasts with previous research. Moreover, our learning procedure builds upon recent work on the online learning of kernel-based classifiers. This yields an efficient, scalable algorithm, which can benefit from recent kernels developed for image comparison. The experiments performed over stock photography data show the advantage of our discriminative ranking approach over state-of-the-art alternatives (e.g. our model yields 26.3\% average precision over the Corel dataset, which should be compared to 22.0\%, for the best alternative model evaluated). Further analysis of the results shows that our model is especially advantageous over difficult queries such as queries with few relevant pictures or multiple-word queries.}
}

@article{Vinciarelli_IEEESPM_2009,
  author   = {Vinciarelli, A. },
  title    = {Capturing Order in Social Interactions},
  journal  = {IEEE Signal Processing Magazine},
  year     = {2009},
  keywords = {IM2.MCA, Report_VIII},
  abstract = {As humans appear to be literally wired for social interaction, it is not surprising to observe that social aspects of human behavior and psychology attract interest in the computing community as well. The gap between social animal and unsocial machine was tolerable when computers were nothing else than improved versions of old tools (e.g., word processors replacing typewriters), but nowadays computers go far beyond that simple role. Today, computers are the natural means for a wide spectrum of new, inherently social, activities like remote communication, distance learning, online gaming, social networking, information seeking and sharing, training in virtual worlds, etc. In this new context, computers must integrate human-human interaction as seamlessly as possible and deal effectively with spontaneous social behaviors of their users. In concise terms, computers need to become socially intelligent.},
  projects = {SSPNet, IM2}
}

@inproceedings{riedhammer08,
  title     = {Packing the meeting summarization knapsack},
  author    = {Riedhammer, K. and Gillick, D. and Favre, B. and Hakkani-Tur, D.},
  booktitle = {Proceedings of Interspeech 2008},
  address   = {Brisbane, Australia},
  year      = {2008},
  note      = {To appear},
  keywords  = {Report_VII, IM2.AP}
}

@inproceedings{schlapbach06writerVerification,
  author    = {Schlapbach, A.  and Bunke, H. },
  title     = {Off-line writer verification: a comparison of a hidden Markov model (HMM) and a Gaussian mixture model (GMM) based system},
  booktitle = {Proc. 10th Int. Workshop Frontiers in Handwriting Recognition},
  pages     = {275--280},
  year      = {2006},
  keywords  = {Report_VI, IM2.VP},
  peer      = {yes}
}

@inproceedings{EUSIPCO2007,
  title     = {A channel selection method for {EEG} classification in emotion assessment based on synchronization likelihood},
  author    = {Ansari-Asl, K. and Chanel, G. and Pun, T.},
  booktitle = {Eusipco 2007, 15th Eur. Signal Proc. Conf.},
  year      = {2007},
  keywords  = {Report_VI, IM2.MPR}
}

@inproceedings{Armstrong-2-ISSCO,
  author    = {Bouillon, P.  and Flores, G.  and Starlander, M.  and Chatzichrisafis, N.  and Santaholma, M.  and Tsourakis, N.  and Rayner, M.  and Hockey, B. A. },
  title     = {A bidirectional grammar-based medical speech translator},
  booktitle = {Proceedings of workshop on Grammar-based approaches to spoken language processing},
  pages     = {41--48},
  publisher = {ACL 2007},
  location  = {Prague, Czech Republic},
  year      = {2007},
  keywords  = {Report_VI, IM2.HMI, ACL 2007, June 29}
}

@inproceedings{millan:2008:chi,
  title     = {Brain-computer interfaces for {HCI} and games},
  author    = {Nijholt, A. and Tan, D. and Allison, B. and Millán, J. del R. and Moore, M. and Graimann, B.},
  booktitle = {Proceedings of the 26th Annual CHI Conference on Human Factors in Computing Systems, Extended Abstracts},
  year      = {2008},
  keywords  = {IM2.BMI, Report_VII},
  abstract  = {In this workshop we study the research themes and the state-of-the-art of brain-computer interaction. Brain-computer interface research has seen much progress in the medical domain, for example for prosthesis control or as biofeedback therapy for the treatment of neurological disorders. Here, however, we look at brain-computer interaction especially as it applies to research in Human-Computer Interaction (HCI). Through this workshop and continuing discussions, we aim to define research approaches and applications that apply to disabled and able-bodied users across a variety of real-world usage scenarios. Entertainment and game design is one of the application areas that will be considered.}
}

@article{48937,
  title    = {Vibrotactile feedback for brain-computer interface operation},
  author   = {Cincotti, F. and Kauhanen, L. and Aloise, F.},
  journal  = {Computational Intelligence and Neuroscience},
  year     = {2007},
  volume   = {2007},
  pages    = {Article ID 48937},
  doi      = {10.1155/2007/48937},
  keywords = {Report_VI, IM2.BMI}
}

@techreport{li:rr08-10,
  title         = {A neural network based regression approach for recognizing simultaneous speech},
  author        = {Li, W. and Kumatani, K. and Dines, J. and Magimai-Doss, M. and Bourlard, H.},
  year          = {2008},
  month         = sep,
  type          = {Idiap-RR},
  number        = {Idiap-RR-10-2008},
  institution   = {IDIAP},
  note          = {Submitted for publication},
  keywords      = {Report_VII, IM2.AP},
  abstract      = {This paper presents our approach for automatic speech recognition (ASR) of overlapping speech. Our system consists of two principal components: a speech separation component and a feature estimation component. In the speech separation phase, we first estimated the speaker's position, and then the speaker location information is used in a GSC-configured beamformer with a minimum mutual information (MMI) criterion, followed by a Zelinski and binary-masking post-filter, to separate the speech of different speakers. In the feature estimation phase, the neural networks are trained to learn the mapping from the features extracted from the pre-separated speech to those extracted from the close-talking microphone speech signal. The outputs of the neural networks are then used to generate acoustic features, which are subsequently used in acoustic model adaptation and system evaluation. The proposed approach is evaluated through ASR experiments on the PASCAL Speech Separation Challenge II (SSC2) corpus. We demonstrate that our system provides large improvements in recognition accuracy compared with a single distant microphone case and the performance of ASR system can be significantly improved both through the use of MMI beamforming and feature mapping approaches.},
  ipdmembership = {speech},
  projects      = {Idiap},
  pdf           = {ftp://ftp.idiap.ch/pub/reports/2008/li-idiap-rr-08-10.pdf},
  postscript    = {ftp://ftp.idiap.ch/pub/reports/2008/li-idiap-rr-08-10.ps.gz}
}

@inproceedings{huijbregts07a,
  title     = {The Blame Game: Performance Analysis of Speaker Diarization System Components},
  author    = {Huijbregts, M. and Wooters, C.},
  booktitle = {Proc. Interspeech},
  address   = {Antwerp},
  year      = {2007},
  note      = {To appear},
  keywords  = {Report_VI, IM2.AP}
}

@inproceedings{eth_biwi_00436,
  author    = {Leibe, B.  and Cornelis, N.  and Cornelis, K.  and van Gool, L. },
  title     = {Integrating recognition and reconstruction for cognitive traffic scene analysis from a moving vehicle},
  booktitle = {DAGM Annual Pattern Recognition Symposium},
  series    = {LNCS},
  volume    = {4174},
  pages     = {192--201},
  publisher = {Springer},
  year      = {2006},
  keywords  = {Report_VI, IM2.VP}
}

@article{Orabona_PR_2009,
  title    = {Towards Life-long Learning for Cognitive Systems: Online Independent Support Vector Machine},
  author   = {Orabona, F. and Castellini, C. and Caputo, B. and Luo, J. and Sandini, G.},
  journal  = {Pattern Recognition},
  year     = {2009},
  note     = {Accepted for publication},
  keywords = {IM2.MPR, Report_VIII},
  projects = {Idiap, DIRAC}
}

@techreport{Magimai.-Doss_Idiap-RR-24-2009,
  author      = {Magimai-Doss, M.  and Aradilla, G.  and Bourlard, H. },
  title       = {On Joint Modelling of Grapheme and Phoneme Information using KL-HMM for ASR},
  type        = {Idiap-RR},
  number      = {Idiap-RR-24-2009},
  institution = {Idiap},
  year        = {2009},
  keywords    = {IM2.AP, Report_VIII},
  abstract    = {In this paper, we propose a simple approach to jointly model both grapheme and phoneme information using Kullback-Leibler divergence based HMM (KL-HMM) system. More specifically, graphemes are used as subword units and phoneme posterior probabilities estimated at output of multilayer perceptron are used as observation feature vector. Through preliminary studies on DARPA Resource Management corpus it is shown that although the proposed approach yield lower performance compared to KL-HMM system using phoneme as subword units, this gap in the performance can be bridged via temporal modelling at the observation feature vector level and contextual modelling of early tagged contextual graphemes.},
  projects    = {IM2, SNSF-MULTI}
}

@article{Humm20079a,
  title         = {Hidden {Markov} models for spoken signature verification},
  author        = {Humm, A. and Hennebert, J. and Ingold, R.},
  year          = {2007},
  internal-note = {NOTE(review): required journal field is missing for this @article -- confirm the actual publication venue and add journal (or retype as @inproceedings/@techreport)},
  keywords      = {Report_VII, IM2.HMI}
}

@article{Monay_EURASIPJIVP_2009,
  author   = {Monay, F.  and Quelhas, P.  and Odobez, J. -M.  and Gatica-Perez, D. },
  title    = {Contextual classification of image patches with latent aspect models},
  journal  = {EURASIP Journal on Image and Video Processing, Special Issue on Patches in Vision},
  year     = {2009},
  note     = {to appear},
  keywords = {IM2.VP, Report_VIII},
  abstract = {We present a novel approach for contextual classification of image patches in complex visual scenes, based on the use of histograms of quantized features and probabilistic aspect models. Our approach uses context in two ways: (1) by using the fact that specific learned aspects correlate with the semantic classes, which resolves some cases of visual polysemy often present in patch-based representations, and (2) by formalizing the notion that scene context is image-specific -what an individual patch represents depends on what the rest of the patches in the same image are-. We demonstrate the validity of our approach on a man-made vs. natural patch classification problem. Experiments on an image collection of complex scenes show that the proposed approach improves region discrimination, producing satisfactory results, and outperforming two non-contextual methods. Furthermore, we also show that co-occurrence and traditional (Markov Random Field) spatial contextual information can be conveniently integrated for further improved patch classification.},
  projects = {Idiap, IM2, SNSF-MULTI}
}

@inproceedings{Ba:ICASSP:2008,
  author    = {Ba, S.  and Odobez, J. -M. },
  title     = {Multi-party focus of attention recognition in meetings from head pose and multimodal contextual cues},
  booktitle = {IEEE Int. Conf. on Acoustics, Speech, and Signal Processing (ICASSP)},
  year      = {2008},
  keywords  = {Report_VII, IM2.VP},
  abstract  = {We address the problem of recognizing the visual focus of attention (VFOA) of meeting participants from their head pose and contextual cues. The main contribution of the paper is the use of a head pose posterior distribution as a representation of the head pose information contained in the image data. This posterior encodes the probabilities of the different head poses given the image data, and constitute therefore a richer representation of the data than the mean or the mode of this distribution, as done in all previous work. These observations are exploited in a joint interaction model of all meeting participants pose observations, VFOAs, speaking status and of environmental contextual cues. Numerical experiments on a public database of 4 meetings of 22min on average show that this change of representation allows for a 5.4\% gain with respect to the standard approach using head pose as observation.}
}

@inproceedings{bertolami07multipleClassifier,
  author    = {Bertolami, R.  and Bunke, H. },
  title     = {Multiple classifier methods for offline handwritten text line recognition},
  booktitle = {Multiple Classifier Systems},
  editor    = {Haindl, M.  and Kittler, J.  and Roli, F. },
  series    = {Lecture Notes in Computer Science},
  volume    = {4472},
  pages     = {72--81},
  publisher = {Springer},
  year      = {2007},
  keywords  = {Report_VI, IM2.VP},
  peer      = {yes}
}

@inproceedings{liwicki07featureSelection,
  author    = {Liwicki, M.  and Bunke, H. },
  title     = {Feature selection for on-line handwriting recognition of whiteboard notes},
  booktitle = {Proc. 13th Conf. of the Graphonomics Society},
  pages     = {101--105},
  year      = {2007},
  isbn      = {978-0-7326-4003-3},
  keywords  = {Report_VII, IM2.VP},
  peer      = {yes}
}

@inproceedings{Ullah_IROS_2009,
  title     = {You live, you learn, you forget: continuous learning of visual places with a forgetting mechanism},
  author    = {Ullah, M. M. and Orabona, F. and Caputo, B.},
  booktitle = {IEEE/RSJ International Conference on Intelligent Robots and Systems ({IROS})},
  year      = {2009},
  keywords  = {IM2.VP, IM2.MPR, Report_VIII},
  projects  = {Idiap, DIRAC}
}

@phdthesis{Scaringella_THESIS_2009,
  title    = {On the design of audio features robust to the album-effect for music information retrieval.},
  author   = {Scaringella, N.},
  year     = {2009},
  month    = jun,
  school   = {Ecole Polytechnique F{\'e}d{\'e}rale de Lausanne},
  note     = {Th{\`e}se EPFL, no 4412 (2009). Dir.: Herv{\'e} Bourlard.},
  keywords = {channel normalization, machine learning, music information retrieval, neural networks, rhythm, timbre, IM2.AP, Report_VIII},
  abstract = {Short-term spectral features - and most notably Mel-Frequency Cepstral Coefficients (MFCCs) - are the most widely used descriptors of audio signals and are deployed in a majority of state-of-the-art Music Information Retrieval (MIR) systems. These descriptors have however demonstrated their limitations in the context of speech processing when training and testing conditions of the system do not match, like e.g. in noisy conditions or under a channel mismatch. A related problem has been observed in the context of music processing. It has indeed been hypothesized that MIR algorithms relying on the use of short-term spectral features were unexpectedly picking up on similarities in the production/mastering qualities of music albums. This problem has been referred to as the album-effect in the literature though it has never been studied in depth. It is shown in this thesis how the album-effect relates to the problem of channel mismatch. A measure of robustness to the album-effect is proposed and channel normalization techniques borrowed from the speech processing community are evaluated to help at improving the robustness of short-term spectral features. Alternatively, longer-term features describing critical-band specialized temporal patterns (TRAPs) are adapted to the context of music processing. It is shown how such features can help at describing either timbre or rhythm content depending on the scale considered for analysis and how robust they are to the album-effect. Contrarily to more classic short-term spectral descriptors, TRAP-based features encode some form of prior knowledge of the problem considered through a trained feature extraction chain. The lack of appropriately annotated datasets raises however some new issues when it comes to training the feature extraction chain. Advanced unsupervised learning strategies are considered in this thesis and evaluated against more traditional supervised approaches relying on coarse-grained annotations such as music genres. Specialized learning strategies and specialized architectures are also proposed to compensate for some inherent variability of the data due either to album-related factors or to the dependence of music signals to the tempo of the performance.},
  projects = {Idiap}
}

@article{BBH-tip08,
  title    = {Visual attention on the sphere},
  author   = {Bogdanova, I. and Bur, A. and H{\"u}gli, H.},
  journal  = {IEEE Transactions on Image Processing},
  year     = {2008},
  note     = {In press},
  keywords = {Report_VII, IM2.VP}
}

@inproceedings{TSOURAKIS08.620,
  title     = {Building mobile spoken dialogue applications using {Regulus}},
  author    = {Rayner, M. and Tsourakis, N. and Georgescul, M. and Bouillon, P.},
  editor    = {{European Language Resources Association (ELRA)}},
  booktitle = {Proceedings of the Sixth International Language Resources and Evaluation (LREC'08)},
  year      = {2008},
  keywords  = {IM2.HMI, Report_VII},
  abstract  = {Regulus is an Open Source platform that supports construction of rule-based medium-vocabulary spoken dialogue applications. It has already been used to build several substantial speech-enabled applications, including NASA's Clarissa procedure navigator and Geneva University's MedSLT medical speech translator. Systems like these would be far more useful if they were available on a hand-held device, rather than, as with the present version, on a laptop. In this paper we describe the Open Source framework we have developed, which makes it possible to run Regulus applications on generally available mobile devices, using a distributed client-server architecture that offers transparent and reliable integration with different types of ASR systems. We describe the architecture, an implemented calendar application prototype hosted on a mobile device, and an evaluation. The evaluation shows that performance on the mobile device is as good as performance on a normal desktop PC.}
}

@incollection{2009-deville-lncsmmi,
  author    = {Deville, B.  and Bologna, G.  and Vinckenbosch, M.  and Pun, T. },
  title     = {See color: seeing colours with an orchestra},
  booktitle = {Human Machine Interaction: Research Results of the MMI Program},
  editor    = {Lalanne, D.  and Kohlas, J. },
  series    = {Lecture Notes in Computer Science},
  volume    = {5440},
  pages     = {251--279},
  publisher = {Springer},
  year      = {2009},
  note      = {Subseries: Programming and Software Engineering},
  keywords  = {IM2.MCA, Report_VIII}
}

@inproceedings{morrison2009:ecirws,
  author    = {Morrison, D.  and Marchand-Maillet, S.  and Bruno, E. },
  title     = {Modelling long-term relevance feedback},
  booktitle = {Proceedings of the ECIR Workshop on Information Retrieval over Social Networks},
  year      = {2009},
  url       = {http://viper.unige.ch/documents/pdf/morrison2009-ecirws.pdf},
  keywords  = {IM2.MCA, Report_VIII}
}

@article{schlapbach08writer,
  author   = {Schlapbach, A.  and Liwicki, M.  and Bunke, H. },
  title    = {A writer identification system for on-line whiteboard data},
  journal  = {Pattern Recognition},
  volume   = {41},
  pages    = {2381--2397},
  year     = {2008},
  keywords = {Report_VII, IM2.VP},
  peer     = {yes}
}

@article{Friedland07,
  author   = {Friedland, G.  and Vinyals, O.  and Huang, Y.  and Muller, C. },
  title    = {Prosodic and other long-term features for speaker diarization},
  journal  = {IEEE Transactions on Audio, Speech and Language Processing},
  volume   = {17},
  number   = {5},
  pages    = {985--993},
  year     = {2009},
  keywords = {IM2.AP, Report_VIII}
}

@techreport{li:rr07-55,
  title       = {Robust overlapping speech recognition based on neural networks},
  author      = {Li, W. and Dines, J. and Magimai-Doss, M.},
  year        = {2007},
  type        = {Idiap-RR},
  number      = {Idiap-RR-55-2007},
  institution = {IDIAP},
  keywords    = {IM2.AP, Report_VII},
  abstract    = {We address issues for improving hands-free speech recognition performance in the presence of multiple simultaneous speakers using multiple distant microphones. In this paper, a log spectral mapping is proposed to estimate the log mel-filterbank outputs of clean speech from multiple noisy speech using neural networks. Both the mapping of the far-field speech and combination of the enhanced speech and the estimated interfering speech are investigated. Our neural network based feature enhancement method incorporates the noise information and can be viewed as a non-linear log spectral subtraction. Experimental studies on MONC corpus showed that MLP-based mapping techniques yield an improvement in the recognition accuracy for the overlapping speech.}
}

@inproceedings{livescu07b,
  title     = {Articulatory Feature-based Methods for Acoustic and Audio-visual speech Recognition: Summary from the 2006 JHU Summer Workshop},
  author    = {Livescu, K. and Cetin, O. and Hasegawa-Johnson, M. and King, S. and Bartels, C. and Borges, N. and Kantor, A. and Lal, P. and Yung, L. and Bezman, A. and Dawson-Haggerty, S. and Woods, B. and Frankel, J. and Magimai-Doss, M. and Saenko, K.},
  booktitle = {Proc. ICASSP},
  address   = {Honolulu},
  year      = {2007},
  keywords  = {Report_VI, IM2.AP}
}

@article{Tommasi_PRL_2008,
  title    = {Discriminative cue integration for medical image annotation},
  author   = {Tommasi, T. and Orabona, F. and Caputo, B.},
  journal  = {Pattern Recognition Letters},
  year     = {2008},
  note     = {Special Issue on Automatic Annotation of Medical Images (ImageCLEF 2007), in press},
  keywords = {IM2.VP, IM2.MPR, Report_VIII},
  abstract = {Automatic annotation of medical images is an increasingly important tool for physicians in their daily activity. Hospitals nowadays produce an increasing amount of data. Manual annotation is very costly and prone to human mistakes. This paper proposes a multi-cue approach to automatic medical image annotation. We represent images using global and local features. These cues are then combined using three alternative approaches, all based on the Support Vector Machine algorithm. We tested our methods on the IRMA database, and with two of the three approaches proposed here we participated in the 2007 ImageCLEFmed benchmark evaluation, in the medical image annotation track. These algorithms ranked first and fifth respectively among all submission. Experiments using the third approach also confirm the power of cue integration for this task.},
  projects = {Idiap}
}

@inproceedings{MMSPL-CONF-2008-003,
  author      = {Dufaux, F.  and Ebrahimi, T. },
  title       = {H.264/AVC Video Scrambling for Privacy Protection},
  booktitle   = {IEEE International Conference on Image Processing (ICIP2008)},
  year        = {2008},
  location    = {San Diego},
  keywords    = {Report_VII, IM2.MCA,video surveillance; privacy; scrambling},
  abstract    = {In this paper, we address the problem of privacy in video surveillance systems. More specifically, we consider the case of H.264/AVC which is the state-of-the-art in video coding. We assume that Regions of Interest (ROI), containing privacy-sensitive information, have been identified. The content of these regions are then concealed using scrambling. More specifically, we introduce two region-based scrambling techniques. The first one pseudo-randomly flips the sign of transform coefficients during encoding. The second one is performing a pseudo-random permutation of transform coefficients in a block. The Flexible Macroblock Ordering (FMO) mechanism of H.264/AVC is exploited to discriminate between the ROI which are scrambled and the background which remains clear. Experimental results show that both techniques are able to effectively hide private information in ROI, while the scene remains comprehensible. Furthermore, the loss in coding efficiency stays small, whereas the required additional computational complexity is negligible.},
  affiliation = {EPFL},
  details     = {http://infoscience.epfl.ch/record/125157},
  oai-id      = {oai:infoscience.epfl.ch:125157},
  oai-set     = {conf},
  review      = {REVIEWED},
  status      = {ACCEPTED},
  unit        = {MMSPL}
}

@inproceedings{perrin:ecmr:2007,
title = {Bayesian controller for a novel semi-autonomous navigation concept},
author = {Perrin, X.  and Chavarriaga, R.  and Siegwart, R.  and Millán, J. del R. },
booktitle = {3rd European Conference on Mobile Robots (ECMR 2007)},
year = {2007},
note = {IDIAP-RR 07-26},
keywords = {Report_VI, IM2.BMI, major},
abstract = {This paper presents a novel concept of semi-autonomous navigation where a mobile robot evolves autonomously under the monitoring of a human user. The user provides corrective commands to the robot whenever he disagrees with the robot's navigational choices. These commands are not related to navigational values like directions or goals, but to the relevance of the robot's actions to the overall task. A binary error signal is used to correct the robot's decisions and to bring it to the desired goal location. This simple interface could easily be adapted to input systems designed for disabled people, offering them a convenient alternative to existing assistive systems. After a description of the whole concept, a special focus is given to the decisional process, which takes into account in a Bayesian way the environment perceived by the robot and the user generated signals in order to propose a navigational strategy to the human user. The strength and advantages of the proposed semi-autonomous concept are illustrated with two experiments. Keywords: Semi-autonomous navigation, error signal, probabilistic reasoning, human-machine interaction.},
ipdmembership = {learning},
ipdxref = {techreport:perrin-idiap-rr-07-26.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2007/perrin-ecmr-2007.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2007/perrin-ecmr-2007.ps.gz}
}

@inproceedings{MMSPL-CONF-20091-007,
title = {Influence of audio-visual attention on perceived quality of standard definition multimedia content},
author = {Lee, J. -S.  and De Simone, F.  and Ebrahimi, T. },
booktitle = {First International Workshop on Quality of Multimedia Experience (QoMEX 2009)},
year = {2009},
location = {San Diego, CA, U.S.A.},
url = {http://www.qomex2009.org},
keywords = {Quality assessment, Audio-visual Focus of Attention, Cross-modal interaction, Perceived quality, IM2.MCA, Report_VIII},
abstract = {When human subjects assess the quality of multimedia data, high level perceptual processes such as Focus of Attention (FoA) and eye movements are believed to play an important role in such tasks. While prior art reports incorporation of visual FoA into objective quality metrics, audio-visual FoA has been rarely addressed and utilized in spite of the importance and presence of both audio and video information in many multimedia systems. This paper explores the influence of audio-visual FoA in the perceived quality of standard definition audio-visual sequences. Results of a subjective quality assessment study are reported, where it is shown that the sound source attracts visual attention and thereby the visual degradation in the regions far from the source is less perceived when compared to sound-emitting regions.},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/record/137257},
oai-id = {oai:infoscience.epfl.ch:137257},
oai-set = {conf},
review = {REVIEWED},
status = {ACCEPTED},
unit = {MMSPL}
}

@phdthesis{MaurizioRigamonti20081,
  author   = {Rigamonti, M. },
  title    = {A framework for structuring multimedia archives and for browsing efficiently through multimodal links},
  school   = {University of Fribourg, Switzerland},
  year     = {2008},
  keywords = {IM2.HMI, Report_VIII}
}

@inproceedings{Grandvalet_NIPS_2008,
title = {Support Vector Machines with a Reject Option},
author = {Grandvalet, Y.  and Rakotomamonjy, A.  and Keshet, J.  and Canu, S. },
crossref = {Grandvalet_Idiap-RR-01-2009},
booktitle = {Proceedings of the 22nd Annual Conference on Neural Information Processing Systems},
year = {2008},
month = dec,
keywords = {IM2.AP, Report_VIII},
abstract = {We consider the problem of binary classification where the classifier may abstain instead of classifying each observation. The Bayes decision rule for this setup, known as Chow's rule, is defined by two thresholds on posterior probabilities. From simple desiderata, namely the consistency and the sparsity of the classifier, we derive the double hinge loss function that focuses on estimating conditional probabilities only in the vicinity of the threshold points of the optimal decision rule. We show that, for suitable kernel machines, our approach is universally consistent. We cast the problem of minimizing the double hinge loss as a quadratic program akin to the standard SVM optimization problem and propose an active set method to solve it efficiently. We finally provide preliminary experimental results illustrating the interest of our constructive approach to devising loss functions.},
projects = {DIRAC, Idiap},
}

@techreport{luo:rr06-65,
title = {{SVM}-based transfer of visual knowledge across robotic platforms},
author = {Luo, J.  and Pronobis, A.  and Caputo, B. },
year = {2006},
type = {IDIAP-RR},
number = {65},
institution = {IDIAP},
keywords = {Report_VI, IM2.VP.HMI},
abstract = {This paper presents an SVM-based algorithm for the transfer of knowledge across robot platforms aiming to perform the same task. Our method exploits efficiently the transferred knowledge while updating incrementally the internal representation as new information is available. The algorithm is adaptive and tends to privilege new data when building the SV solution. This prevents the old knowledge to nest into the model and eventually become a possible source of misleading information. We tested our approach in the domain of vision-based place recognition. Extensive experiments show that using transferred knowledge clearly pays off in terms of performance and stability of the solution.},
ipdmembership = {vision},
pdf = {ftp://ftp.idiap.ch/pub/reports/2006/luo-idiap-rr-06-65.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2006/luo-idiap-rr-06-65.ps.gz}
}

@article{galan:2008:clin.neurophysiol,
title = {A brain-actuated wheelchair: asynchronous and non-invasive brain-computer interfaces for continuous control of robots},
author = {Galán, F.  and Nuttin, M.  and Lew, E.  and Ferrez, P. W.  and Vanacker, G.  and Philips, J.  and Millán, J. del R. },
journal = {Clinical Neurophysiology},
year = {2008},
volume = {119},
pages = {2159--2169},
keywords = {IM2.BMI, Report_VII},
abstract = {Objective: To assess the feasibility and robustness of an asynchronous and non-invasive EEG-based Brain-Computer Interface (BCI) for continuous mental control of a wheelchair. Methods: In experiment 1 two subjects were asked to mentally drive both a real and a simulated wheelchair from a starting point to a goal along a pre-specified path. Here we only report experiments with the simulated wheelchair for which we have extensive data in a complex environment that allows a sound analysis. Each subject participated in 5 experimental sessions, each consisting of 10 trials. The time elapsed between two consecutive experimental sessions was variable (from one hour to two months) to assess the system robustness over time. The pre-specified path was divided in 7 stretches to assess the system robustness in different contexts. To further assess the performance of the brain-actuated wheelchair, subject 1 participated in a second experiment consisting of 10 trials where he was asked to drive the simulated wheelchair following 10 different complex and random paths never tried before. Results: In experiment 1 the two subjects were able to reach 100\% (subject 1) and 80\% (subject 2) of the final goals along the pre-specified trajectory in their best sessions. Different performances were obtained over time and path stretches, what indicates that performance is time and context dependent. In experiment 2, subject 1 was able to reach the final goal in 80\% of the trials. Conclusions: The results show that subjects can rapidly master our asynchronous EEG-based BCI to control a wheelchair. Also, they can autonomously operate the BCI over long periods of time without the need for adaptive algorithms externally tuned by a human operator to minimize the impact of EEG non-stationarities. 
This is possible because of two key components: first, the inclusion of a shared control system between the BCI system and the intelligent simulated wheelchair; second, the selection of stable user-specific EEG features that maximize the separability between the mental tasks. Significance: These results show the feasibility of continuously controlling complex robotics devices using an asynchronous and non-invasive BCI.}
}

@inproceedings{MMSPL-CONF-2008-001,
title = {Bayesian feature selection applied in a {P300} brain-computer interface},
author = {Hoffmann, U.  and Yazdani, A.  and Vesin, J. M.  and Ebrahimi, T. },
booktitle = {16th European Signal Processing Conference},
year = {2008},
location = {Lausanne},
url = {http://www.eusipco2008.org/},
keywords = {Report_VII, IM2.MCA.BMI},
abstract = {Feature selection is a machine learning technique that has many interesting applications in the area of brain-computer interfaces (BCIs). Here we show how automatic relevance determination (ARD), which is a Bayesian feature selection technique, can be applied in a BCI system. We present a computationally efficient algorithm that uses ARD to compute sparse linear discriminants. The algorithm is tested with data recorded in a P300 BCI and with P300 data from the BCI competition 2004. The achieved classification accuracy is competitive with the accuracy achievable with a support vector machine (SVM). At the same time the computational complexity of the presented algorithm is much lower than that of the SVM. Moreover, it is shown how visualization of the computed discriminant vectors allows to gain insights about the neurophysiological mechanisms underlying the P300 paradigm.}
}

@article{bertolami08integration,
title = {Integration of n-gram language models in multiple classifier systems for offline handwritten text line recognition},
author = {Bertolami, R.  and Bunke, H. },
journal = {International Journal of Pattern Recognition and Artificial Intelligence},
year = {2008},
volume = {22},
number = {7},
pages = {1301--1321},
keywords = {IM2.VP, Report_VIII},
peer = {yes}
}

@techreport{perrin:tech:2009b,
  author      = {Perrin, X.  and Colas, F.  and Pradalier, C.  and Siegwart, R. },
  title       = {Learning human habits and reactions to external events with a dynamic Bayesian network},
  institution = {Autonomous Systems Lab, ETHZ},
  year        = {2009},
  keywords    = {IM2.BMI, Report_VIII}
}

@techreport{Richiardi2006ABIDphaseIreport,
  author      = {Richiardi, J.  and Drygajlo, A. },
  title       = {Applying biometrics to identity documents: implementation issues},
  type        = {SNSF AMBAI project technical report},
  institution = {Swiss Federal Institute of Technology},
  year        = {2006},
  keywords    = {Report_VI, IM2.MPR},
  owner       = {Jori}
}

@techreport{motlicek:rr06-58,
  author        = {Motlicek, P.  and Ullal, V.  and Hermansky, H. },
  title         = {Wide-band perceptual audio coding based on frequency-domain linear prediction},
  type          = {IDIAP-RR},
  number        = {58},
  institution   = {IDIAP},
  year          = {2006},
  keywords      = {Report_VI, IM2.AP},
  abstract      = {In this paper we propose an extension of the very low bit-rate speech coding technique, exploiting predictability of the temporal evolution of spectral envelopes, for wide-band audio coding applications. Temporal envelopes in critically band-sized sub-bands are estimated using frequency domain linear prediction applied on relatively long time segments. The sub-band residual signals, which play an important role in acquiring high quality reconstruction, are processed using a heterodyning-based signal analysis technique. For reconstruction, their optimal parameters are estimated using a closed-loop analysis-by-synthesis technique driven by a perceptual model emulating simultaneous masking properties of the human auditory system. We discuss the advantages of the approach and show some properties on challenging audio recordings. The proposed technique is capable of encoding high quality, variable rate audio signals on bit-rates below $1$bit/sample.},
  ipdmembership = {speech},
  pdf           = {ftp://ftp.idiap.ch/pub/reports/2006/motlicek-idiap-rr-06-58.pdf},
  postscript    = {ftp://ftp.idiap.ch/pub/reports/2006/motlicek-idiap-rr-06-58.ps.gz}
}

@inproceedings{Garner_INTERSPEECH_2009,
title = {Real-Time ASR from Meetings},
author = {Garner, P. N.  and Dines, J.  and Hain, T.  and El Hannani, A.  and Karafiat, M.  and Korchagin, D.  and Lincoln, M.  and Wan, V.  and Zhang, L. },
crossref = {Garner_Idiap-RR-15-2009},
booktitle = {Proceedings of Interspeech},
year = {2009},
location = {Brighton, UK},
keywords = {IM2.AP, Report_VIII},
abstract = {The AMI(DA) system is a meeting room speech recognition system that has been developed and evaluated in the context of the NIST Rich Text (RT) evaluations. Recently, the ``Distant Access'' requirements of the AMIDA project have necessitated that the system operate in real-time. Another more difficult requirement is that the system fit into a live meeting transcription scenario. We describe an infrastructure that has allowed the AMI(DA) system to evolve into one that fulfils these extra requirements. We emphasise the components that address the live and real-time aspects.},
projects = {AMIDA, IM2},
}

@inproceedings{Orabona_ICRA2009_2009,
title = {Model Adaptation with Least-Squares {SVM} for Adaptive Hand Prosthetics},
author = {Orabona, F.  and Castellini, C.  and Caputo, B.  and Fiorilla, A. E.  and Sandini, G. },
booktitle = {IEEE International conference on Robotics and Automation},
year = {2009},
location = {Kobe, Japan},
keywords = {IM2.MPR, Report_VIII},
projects = {Idiap, DIRAC},
}

@inproceedings{Hakkani-T,
title = {Towards automatic argument diagramming of multiparty meetings},
author = {Hakkani-Tur, D. },
booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
year = {2009},
location = {Taipei, Taiwan},
keywords = {IM2.AP, Report_VIII}
}

@inproceedings{sgarimel:eusipco:2008,
  author    = {Sivaram, G. S. V. S.  and Hermansky, H. },
  title     = {Emulating temporal receptive fields of auditory mid-brain neurons for automatic speech recognition},
  booktitle = {Proc. 16th European Signal Processing Conference (EUSIPCO)},
  location  = {Lausanne},
  year      = {2008},
  crossref  = {sgarimel:rr08-24},
  note      = {IDIAP-RR 08-24},
  keywords  = {IM2.AP, Report_VII},
  abstract  = {This paper proposes modifications to the Multi-resolution RASTA (MRASTA) feature extraction technique for the automatic speech recognition (ASR). By emulating asymmetries of the temporal receptive field (TRF) profiles of auditory mid-brain neurons, we obtain more than $13\%$ relative improvement in word error rate on OGI-Digits database. Experiments on TIMIT database confirm that proposed modifications are indeed useful.}
}

@inproceedings{cuendet07a,
title = {Cross-Genre Feature Comparisons for Spoken Sentence Segmentation},
author = {Cuendet, S.  and Hakkani-Tur, D.  and Shriberg, E.  and Fung, J.  and Favre, B. },
booktitle = {International Conference on Semantic Computing (ICSC)},
year = {2007},
location = {Irvine, CA},
keywords = {Report_VII, IM2.AP}
}

@inproceedings{Armstrong-10-ISSCO,
  author    = {Popescu-Belis, A.  and Zufferey, S. },
  title     = {Contrasting the automatic identification of two discourse markers in multiparty dialogues},
  booktitle = {Proceedings of SIGDIAL 2007},
  series    = {8th SIGdial Workshop on Discourse and Dialogue},
  location  = {Antwerp, Belgium},
  year      = {2007},
  pages     = {10},
  keywords  = {Report_VI, IM2.MCA}
}

@article{Fleuret_TPAMI_2008,
  author   = {Fleuret, F.  and Berclaz, J.  and Lengagne, R.  and Fua, P. },
  title    = {Multi-Camera People Tracking with a Probabilistic Occupancy Map},
  journal  = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  volume   = {30},
  number   = {2},
  pages    = {267--282},
  year     = {2008},
  keywords = {IM2.VP, IM2.MPR, Report_VIII},
  abstract = {Given two to four synchronized video streams taken at eye level and from different angles, we show that we can effectively combine a generative model with dynamic programming to accurately follow up to six individuals across thousands of frames in spite of significant occlusions and lighting changes. In addition, we also derive metrically accurate trajectories for each one of them. Our contribution is twofold. First, we demonstrate that our generative model can effectively handle occlusions in each time frame independently, even when the only data available comes from the output of a simple background subtraction algorithm and when the number of individuals is unknown a priori. Second, we show that multi-person tracking can be reliably achieved by processing individual trajectories separately over long sequences, provided that a reasonable heuristic is used to rank these individuals and avoid confusing them with one another.},
  projects = {Idiap, IM2}
}

@inproceedings{2009-bologna-iwinac,
title = {Blind navigation along a sinuous path by means of the {See ColOr} interface},
author = {Bologna, G.  and Deville, B.  and Pun, T. },
booktitle = {IWINAC 2009, 3rd International Work-Conference on the Interplay between Natural and Artificial Computation},
year = {2009},
month = jun,
location = {Santiago de Compostela, Spain},
keywords = {IM2.MCA, Report_VIII}
}

@inproceedings{Negoescu_ACMMM,
title = {{Topickr}: {Flickr} Groups and Users Reloaded},
author = {Negoescu, R. -A.  and Gatica-Perez, D. },
crossref = {negoescu:rr08-61},
booktitle = {MM '08: Proc. of the 16th ACM Intl. Conf. on Multimedia},
year = {2008},
month = oct,
publisher = {ACM},
keywords = {IM2.MPR, Report_VIII},
abstract = {With the increased presence of digital imaging devices there also came an explosion in the amount of multimedia content available online. Users have transformed from passive consumers of media into content creators. Flickr.com is such an example of an online community, with over 2 billion photos (and more recently, videos as well), most of which are publicly available. The user interaction with the system also provides a plethora of metadata associated with this content, and in particular tags. One very important aspect in Flickr is the ability of users to organize in self-managed communities called groups. Although users and groups are conceptually different, in practice they can be represented in the same way: a bag-of-tags, which is amenable for probabilistic topic modeling. We present a topic-based approach to represent Flickr users and groups and demonstrate it with a web application, Topickr, that allows similarity based exploration of Flickr entities using their topic-based representation, learned in an unsupervised manner.},
projects = {Idiap, IM2, SNSF-MULTI}
}

@inproceedings{Vinciarelli_ACM-MM_2008,
title = {Social signal processing: state-of-the-art and future perspectives of an emerging domain},
author = {Vinciarelli, A.  and Pantic, M.  and Bourlard, H.  and Pentland, A. },
booktitle = {Proceedings of the ACM International Conference on Multimedia},
year = {2008},
keywords = {IM2.MCA, Report_VII},
abstract = {The ability to understand and manage social signals of a person we are communicating with is the core of social intelligence. Social intelligence is a facet of human intelligence that has been argued to be indispensable and perhaps the most important for success in life. This paper argues that next-generation computing needs to include the essence of social intelligence -- the ability to recognize human social signals and social behaviours like politeness, and disagreement -- in order to become more effective and more efficient. Although each one of us understands the importance of social signals in everyday life situations, and in spite of recent advances in machine analysis of relevant behavioural cues like blinks, smiles, crossed arms, laughter, and similar, design and development of automated systems for Social Signal Processing (SSP) are rather difficult. This paper surveys the past efforts in solving these problems by a computer, it summarizes the relevant findings in social psychology, and it proposes a set of recommendations for enabling the development of the next generation of socially-aware computing.}
}

@inproceedings{MMSPL-CONF-2009-006,
title = {Subjective assessment of {H.264/AVC} video sequences transmitted over a noisy channel},
author = {De Simone, F.  and Naccari, M.  and Tagliasacchi, M.  and Dufaux, F.  and Tubaro, S.  and Ebrahimi, T. },
booktitle = {First International Workshop on Quality of Multimedia Experience (QoMEX 2009)},
year = {2009},
location = {San Diego, CA, U.S.A.},
url = {http://www.qomex2009.org/},
keywords = {Subjective video quality assessment; Packet loss rate; H.264/AVC; Error resilience, IM2.MCA, Report_VIII},
abstract = {In this paper we describe a database containing subjective assessment scores relative to 78 video streams encoded with H.264/AVC and corrupted by simulating the transmission over error-prone network. The data has been collected from 40 subjects at the premises of two academic institutions. Our goal is to provide a balanced and comprehensive database to enable reproducible research results in the field of video quality assessment. In order to support research works on Full-Reference, Reduced-Reference and No-Reference video quality assessment algorithms, both the uncompressed files and the H.264/AVC bitstreams of each video sequence have been made publicly available for the research community, together with the subjective results of the performed evaluations.},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/record/137256},
oai-id = {oai:infoscience.epfl.ch:137256},
oai-set = {conf},
review = {REVIEWED},
status = {ACCEPTED},
unit = {MMSPL}
}

@inproceedings{faria08b,
title = {Corrected Tandem Features for Acoustic Model Training},
author = {Faria, A.  and Morgan, N. },
booktitle = {Proceedings of IEEE ICASSP},
year = {2008},
location = {Las Vegas, NV},
keywords = {Report_VII, IM2.AP}
}

@inproceedings{faria08a,
title = {When a mismatch can be good: large vocabulary speech recognition trained with idealized tandem features},
author = {Faria, A.  and Morgan, N. },
booktitle = {Proceedings of the ACM Symposium on Applied Computing},
year = {2008},
location = {Fortaleza, Brazil},
keywords = {Report_VII, IM2.AP}
}

@inproceedings{Favre_ACMMULTIMEDIA2009_2009,
title = {Automatic Role Recognition in Multiparty Recordings Using Social Networks and Probabilistic Sequential Models},
author = {Favre, S.  and Dielmann, A.  and Vinciarelli, A. },
booktitle = {ACM International Conference on Multimedia},
year = {2009},
note = {To appear},
keywords = {IM2.MCA, Report_VIII},
abstract = {The automatic analysis of social interactions is attracting significant interest in the multimedia community. This work addresses one of the most important aspects of the problem, namely the recognition of roles in social exchanges. The proposed approach is based on Social Network Analysis, for the representation of individuals in terms of their interactions with others, and probabilistic sequential models, for the recognition of role sequences underlying the sequence of speakers in conversations. The experiments are performed over different kinds of data (around 90 hours of broadcast data and meetings), and show that the performance depends on how formal the roles are, i.e. on how much they constrain people behavior.},
projects = {Idiap, IM2, SSPNet},
}

@inproceedings{ganapathy:icassp:2008,
  author    = {Ganapathy, S.  and Motlicek, P.  and Hermansky, H.  and Garudadri, H. },
  title     = {Temporal masking for bit-rate reduction in audio codec based on frequency domain linear prediction},
  booktitle = {IEEE Int. Conf. on Acoustics, Speech, and Signal Processing (ICASSP)},
  location  = {Las Vegas, NV},
  year      = {2008},
  pages     = {4781--4784},
  crossref  = {ganapathy:rr07-48},
  note      = {IDIAP-RR 07-48},
  doi       = {10.1109/icassp.2008.4518726},
  isbn      = {978-1-4244-1483-3},
  issn      = {1520-6149},
  keywords  = {IM2.AP, Report_VII},
  abstract  = {Audio coding based on Frequency Domain Linear Prediction (FDLP) uses auto-regressive model to approximate Hilbert envelopes in frequency sub-bands for relatively long temporal segments. Although the basic technique achieves good quality of the reconstructed signal, there is a need for improving the coding efficiency. In this paper, we present a novel method for the application of temporal masking to reduce the bit-rate in a FDLP based codec. Temporal masking refers to the hearing phenomenon, where the exposure to a sound reduces response to following sounds for a certain period of time (up to $200$ ms). In the proposed version of the codec, a first order forward masking model of the human ear is implemented and informal listening experiments using additive white noise are performed to obtain the exact noise masking thresholds. Subsequently, this masking model is employed in encoding the sub-band FDLP carrier signal. Application of the temporal masking in the FDLP codec results in a bit-rate reduction of about $10$\% without degrading the quality. Performance evaluation is done with Perceptual Evaluation of Audio Quality (PEAQ) scores and with subjective listening tests.}
}

@inproceedings{Zhao:ICSA:2008,
title = {Multi-stream spectro-temporal features for robust speech recognition},
author = {Zhao, S. Y.  and Morgan, N. },
booktitle = {9th International Conference of the ISCA (Interspeech 2008)},
year = {2008},
location = {Brisbane, Australia},
pages = {898--901},
keywords = {IM2.AP, Report_VIII}
}

@inproceedings{galan:grazBCI2008:2008,
title = {Continuous brain-actuated control of an intelligent wheelchair by human eeg},
author = {Galán, F.  and Nuttin, M.  and Vanhooydonck, D.  and Lew, E.  and Ferrez, P. W.  and Philips, J.  and Millán, J. del R. },
crossref = {galan:rr08-53},
booktitle = {4th International Brain-Computer Interface Workshop \& Training Course},
year = {2008},
location = {Graz University of Technology, Graz, Austria},
note = {IDIAP-RR 08-53},
keywords = {IM2.BMI, Report_VII},
abstract = {The objective of this study is to assess the feasibility of controlling an asynchronous and non-invasive brain-actuated wheelchair by human EEG. Three subjects were asked to mentally drive the wheelchair to 3 target locations using 3 mental commands. These mental commands were respectively associated with the three wheelchair steering behaviors: emphturn left, emphturn right, and emphmove forward. The subjects participated in 30 randomized trials (10 trials per target). The performance was assessed in terms of percentage of reached targets calculated in function of the distance between the final wheelchair position and the target at each trial. To assess the brain-actuated control achieved by the subjects, their performances were compared with the performance achieved by a random BCI. The subjects drove the wheelchair closer than 1 meter from the target in 20\%, 37\%, and 7\% of the trials, and closer than 2 meters in 37\%, 53\%, and 27\% of the trials, respectively. The random BCI drove it closer than 1 and 2 meters in 0\% and 13\% of the trials, respectively. The results show that the subjects could achieve a significant level of mental control, even if far from optimal, to drive an intelligent wheelchair, thus demonstrating the feasibility of continuously controlling complex robotics devices using an asynchronous and non-invasive BCI.}
}

@inproceedings{andreani06,
title = {Let's {DiSCoH}: Collecting an Annotated Open Corpus with Dialog Acts and Reward Signals for Natural Language Helpdesks},
author = {Andreani, G.  and Di Fabbrizio, G.  and Gilbert, M.  and Gillick, D.  and Hakkani-Tur, D.  and Lemon, O. },
booktitle = {Proc. IEEE/ACL Workshop on Spoken Language Technology},
year = {2006},
keywords = {Report_VI, IM2.AP}
}

@techreport{Imseng_Idiap-RR-07-2009,
  author      = {Imseng, D. },
  title       = {Novel initialization methods for Speaker Diarization},
  type        = {Idiap-RR},
  number      = {Idiap-RR-07-2009},
  institution = {Idiap},
  year        = {2009},
  note        = {Master's thesis},
  keywords    = {IM2.AP, Report_VIII},
  abstract    = {Speaker Diarization is the process of partitioning an audio input into homogeneous segments according to speaker identity where the number of speakers in a given audio input is not known a priori. This master thesis presents a novel initialization method for Speaker Diarization that requires less manual parameter tuning than most current GMM/HMM based agglomerative clustering techniques and is more accurate at the same time. The thesis reports on empirical research to estimate the importance of each of the parameters of an agglomerative-hierarchical-clustering-based Speaker Diarization system and evaluates methods to estimate these parameters completely unsupervised. The parameter estimation combined with a novel non-uniform initialization method result in a system that performs better than the current ICSI baseline engine on datasets of the National Institute of Standards and Technology (NIST) Rich Transcription evaluations of the years 2006 and 2007 (17\% overall relative improvement).},
  projects    = {Idiap, IM2, AMIDA},
}

@inproceedings{liwicki07online,
  author    = {Liwicki, M.  and Indermühle, E.  and Bunke, H. },
  title     = {On-line handwritten text line detection using dynamic programming},
  booktitle = {Proc. 9th Int. Conf. on Document Analysis and Recognition},
  year      = {2007},
  pages     = {447--451},
  isbn      = {978-0-7695-2822-9},
  keywords  = {Report_VII, IM2.VP},
  peer      = {yes}
}

@inproceedings{Richiardi2007QualityMeasures,
title = {Quality measures in unimodal and multimodal biometric verification},
author = {Richiardi, J.  and Kryszczuk, K.  and Drygajlo, A. },
booktitle = {Proc. 15th European Signal Processing Conf. (EUSIPCO)},
year = {2007},
month = sep,
note = {(invited paper)},
keywords = {Report_VI, IM2.MPR}
}

@inproceedings{LTS-CONF-2007-029,
  author      = {Noris, B.  and Benmachiche, K.  and Meynet, J.  and Thiran, J. -Ph.  and Billard, A. },
  title       = {Analysis of Head Mounted Wireless Camera Videos for Early Diagnosis of Autism},
  booktitle   = {International Conference on Recognition Systems},
  year        = {2007},
  url         = {http://infoscience.epfl.ch/getfile.py?recid=109492&mode=best},
  keywords    = {Report_VI, LTS5, IM2.VP},
  affiliation = {EPFL},
  details     = {http://infoscience.epfl.ch/search.py?recid=109492},
  oai-id      = {oai:infoscience.epfl.ch:109492},
  oai-set     = {conf},
  review      = {REVIEWED},
  status      = {PUBLISHED},
  unit        = {LTS}
}

@inproceedings{guz07,
title = {Co-training Using Prosodic and Lexical Information for Sentence Segmentation},
author = {Guz, U.  and Cuendet, S.  and Hakkani-Tur, D.  and Tur, G. },
booktitle = {Proceedings of Interspeech},
year = {2007},
location = {Antwerp},
note = {to appear},
keywords = {Report_VI, IM2.AP}
}

@inproceedings{LTS-CONF-2007-020,
title = {{Fisher}'s Discriminant and Relevant Component Analysis for static facial expression classification},
author = {Sorci, M.  and Antonini, G.  and Thiran, J. -Ph. },
booktitle = {15th European Signal Processing Conference (EUSIPCO)},
year = {2007},
location = {Poznan, Poland},
note = {ITS},
url = {http://www.eusipco2007.org/, http://infoscience.epfl.ch/getfile.py?recid=104395&mode=best},
keywords = {Report_VI, LTS5; Facial expression recognition; Dimensionality reduction; IM2.VP},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/search.py?recid=104395},
oai-id = {oai:infoscience.epfl.ch:104395},
oai-set = {conf},
review = {REVIEWED},
status = {PUBLISHED},
unit = {LTS}
}

@inproceedings{Dines_INTERSPEECH_2009,
title = {Measuring the gap between {HMM}-based {ASR} and {TTS}},
author = {Dines, J.  and Yamagishi, J.  and King, S. },
crossref = {Dines_Idiap-RR-16-2009},
booktitle = {Proceedings of Interspeech},
year = {2009},
month = sep,
location = {Brighton, U.K.},
keywords = {speech recognition, speech synthesis, unified models, IM2.AP, Report_VIII},
abstract = {The EMIME European project is conducting research in the development of technologies for mobile, personalised speech-to-speech translation systems. The hidden Markov model is being used as the underlying technology in both automatic speech recognition (ASR) and text-to-speech synthesis (TTS) components, thus, the investigation of unified statistical modelling approaches has become an implicit goal of our research. As one of the first steps towards this goal, we have been investigating commonalities and differences between HMM-based ASR and TTS. In this paper we present results and analysis of a series of experiments that have been conducted on English ASR and TTS systems measuring their performance with respect to phone set and lexicon, acoustic feature type and dimensionality and HMM topology. Our results show that, although the fundamental statistical model may be essentially the same, optimal ASR and TTS performance often demands diametrically opposed system designs. This represents a major challenge to be addressed in the investigation of such unified modelling approaches.},
projects = {EMIME},
}

@inproceedings{LTS-CONF-2007-025,
title = {Image alignment with rotation manifolds built on sparse geometric expansions},
author = {Kokiopoulou, E.  and Frossard, P. },
booktitle = {IEEE International Workshop on Multimedia Signal Processing},
year = {2007},
location = {Chania, Crete, Greece},
url = {http://infoscience.epfl.ch/getfile.py?recid=109422&mode=best},
keywords = {Report_VI, IM2.VP},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/search.py?recid=109422},
oai-id = {oai:infoscience.epfl.ch:109422},
oai-set = {conf},
review = {REVIEWED},
status = {ACCEPTED},
unit = {LTS}
}

@article{marcel:ieee-tpamisi:2007,
title = {Person authentication using brainwaves (eeg) and maximum a posteriori model adaptation},
author = {Marcel, S.  and del R. Millán, J. },
journal = {IEEE TRANSACTIONS ON PATTERN ANALYSIS AND MACHINE INTELLIGENCE Special Issue on Biometrics},
year = {2007},
note = {IDIAP-RR 05-81},
keywords = {Report_VI, IM2.BMI},
abstract = {In this paper, we investigate the use of brain activity for person authentication. It has been shown in previous studies that the brain-wave pattern of every individual is unique and that the electroencephalogram (EEG) can be used for biometric identification. EEG-based biometry is an emerging research topic and we believe that it may open new research directions and applications in the future. However, very little work has been done in this area and was focusing mainly on person identification but not on person authentication. Person authentication aims to accept or to reject a person claiming an identity, i.e comparing a biometric data to one template, while the goal of person identification is to match the biometric data against all the records in a database. We propose the use of a statistical framework based on Gaussian Mixture Models and Maximum A Posteriori model adaptation, successfully applied to speaker and face authentication, which can deal with only one training session. We perform intensive experimental simulations using several strict train/test protocols to show the potential of our method. We also show that there are some mental tasks that are more appropriate for person authentication than others.},
ipdmembership = {vision},
ipdxref = {techreport:marcel-idiap-rr-05-81.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2007/marcel-ieee-tpamisi-2007.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2007/marcel-ieee-tpamisi-2007.ps.gz}
}

@inproceedings{Drygajlo2007MultimodalBiometrics,
title = {Multimodal biometrics for identity documents and smart cards european challenge},
author = {Drygajlo, A. },
booktitle = {Proc. 15th European Signal Processing Conf. (EUSIPCO)},
year = {2007},
note = {(invited paper)},
keywords = {Report_VI, IM2.MPR},
owner = {Jori}
}

@article{Volosynovskiy:2005:EPC,
title = {The edge process model and its application to information hiding capacity analysis},
author = {Voloshynovskiy, S.  and Koval, O.  and Mihcak, M. K.  and Pun, T. },
journal = {IEEE Trans. on Signal Processing},
year = {2006},
volume = {54},
number = {5},
pages = {1813--1825},
url = {http://vision.unige.ch/publications/postscript/2006/ieee-edge-process.pdf},
keywords = {Report_VI, IM2.MPR},
vgclass = {refpap},
vgproject = {watermarking}
}

@inproceedings{chavarriaga:iccn:2007,
title = {To err is human: learning from error potentials in brain-computer interfaces},
author = {Chavarriaga, R.  and Ferrez, P. W.  and del R. Millán, J. },
booktitle = {1st International Conference on Cognitive Neurodynamics (ICCN 2007)},
year = {2007},
note = {IDIAP-RR 07-37},
keywords = {Report_VI, IM2.BMI},
abstract = {Several studies describe evoked EEG potentials elicited when a subject is aware of an erroneous decision either taken by him or by an external interface. This paper studies \emph{Error-related potentials} (ErrP) elicited when a human user monitors an external system upon which he has no control whatsoever. In addition, the possibility of using the ErrPs as learning signals to infer the user's intended strategy is also addressed. Experimental results show that single-trial recognition of correct and error trials can be achieved, allowing the fast learning of the user's strategy. These results may constitute the basis of a new kind of human-computer interaction where the former provides monitoring signals that can be used to modify the performance of the latter. This work has been supported by the Swiss National Science Foundation NCCR-IM2 and by the EC-contract number BACS FP6-IST-027140. This paper only reflects the authors' views and funding agencies are not liable for any use that may be made of the information contained herein.},
ipdmembership = {learning},
ipdxref = {techreport:chavarriaga-idiap-rr-07-37.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2007/chavarriaga-iccn-2007.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2007/chavarriaga-iccn-2007.ps.gz}
}

@techreport{aradilla:rr08-15,
title = {Posterior features applied to speech recognition tasks with limited training data},
author = {Aradilla, G.  and Bourlard, H.  and Magimai-Doss, M. },
year = {2008},
type = {Idiap-RR},
number = {Idiap-RR-15-2008},
institution = {IDIAP},
keywords = {IM2.AP, Report_VII},
abstract = {This paper describes an approach where posterior-based features are applied in those recognition tasks where the amount of training data is insufficient to obtain a reliable estimate of the speech variability. A template matching approach is considered in this paper where posterior features are obtained from a MLP trained on an auxiliary database. Thus, the speech variability present in the features is reduced by applying the speech knowledge captured on the auxiliary database. When compared to state-of-the-art systems, this approach outperforms acoustic-based techniques and obtains comparable results to grapheme-based approaches. Moreover, the proposed method can be directly combined with other posterior-based HMM systems. This combination successfully exploits the complementarity between templates and parametric models.}
}

@inproceedings{bertolami06combination,
title = {Combination of multiple handwritten text line recognition systems with a recursive approach},
author = {Bertolami, R.  and Halter, B.  and Bunke, H. },
booktitle = {Proc. 10th Int. Workshop Frontiers in Handwriting Recognition},
year = {2006},
pages = {61--65},
keywords = {Report_VI, IM2.VP},
peer = {yes}
}

@inproceedings{luethy07segmentation,
title = {Using hidden Markov models as a tool for handwritten text line segmentation},
author = {Lüthy, F.  and Varga, T.  and Bunke, H. },
booktitle = {Proc. 9th Int. Conf. on Document Analysis and Recognition},
year = {2007},
pages = {8--12},
isbn = {978-0-7695-2822-9},
keywords = {Report_VII, IM2.VP},
peer = {yes}
}

@inproceedings{Gottlieb04,
title = {On the use of artificial conversation data for speaker recognition in cars},
author = {Gottlieb, L.  and Friedland, G. },
booktitle = {IEEE International Conference for Semantic Computing, Berkeley, USA},
year = {2009},
keywords = {IM2.AP, Report_VIII}
}

@incollection{morrison2009:wei,
title = {Capturing the semantics of user interaction: a review and case study},
author = {Morrison, D.  and Bruno, E.  and Marchand-Maillet, S. },
booktitle = {Emergent Web Intelligence},
year = {2009},
publisher = {Springer},
keywords = {IM2.MCA, Report_VIII}
}

@inproceedings{morrison07-hierarchical,
title = {Hierarchical long-term learning for automatic image annotation},
author = {Morrison, D.  and Marchand-Maillet, S.  and Bruno, E. },
booktitle = {Proceedings 2nd International Conference on Semantic and Digital Media Technologies},
year = {2007},
keywords = {Report_VII, IM2.MCA}
}

@inproceedings{Kittler2007SPIE,
title = {Quality dependent fusion of intramodal and multimodal biometric experts},
author = {Kittler, J.  and Poh, N.  and Fatukasi, O.  and Messer, K.  and Kryszczuk, K.  and Richiardi, J.  and Drygajlo, A. },
booktitle = {Proc. SPIE Defense and Security Symposium},
year = {2007},
keywords = {Report_VI, IM2.MPR},
owner = {Jori}
}

@inproceedings{wuethrich09language,
title = {Language model integration for the recognition of handwritten medieval documents},
author = {Wuthrich, M.  and Liwicki, M.  and Fischer, A.  and Indermühle, E.  and Bunke, H.  and Viehhauser, G.  and Stolz, M. },
booktitle = {Proc. 10th Int. Conf. on Document Analysis and Recognition},
year = {2009},
volume = {1},
pages = {211--215},
isbn = {978-0-7695-3725-2},
keywords = {IM2.VP, Report_VIII},
peer = {yes}
}

@techreport{Popescu-Belis_Idiap-RR-11-2009,
title = {Comparing meeting browsers using a task-based evaluation method},
author = {Popescu-Belis, A. },
year = {2009},
type = {Idiap-RR},
number = {Idiap-RR-11-2009},
institution = {Idiap},
keywords = {IM2.HMI, Report_VIII},
abstract = {Information access within meeting recordings, potentially transcribed and augmented with other media, is facilitated by the use of meeting browsers. To evaluate their performance through a shared benchmark task, users are asked to discriminate between true and false parallel statements about facts in meetings, using different browsers. This paper offers a review of the results obtained so far with five types of meeting browsers, using similar sets of statements over the same meeting recordings. The results indicate that state-of-the-art speed for true/false question answering is 1.5-2 minutes per question, and precision is 70\%-80\% (vs. 50\% random guess). The use of ASR compared to manual transcripts, or the use of audio signals only, lead to a perceptible though not dramatic decrease in performance scores.},
projects = {Idiap,
IM2,
AMIDA},
}

@article{Vijayasenan_TASLP_2009,
title = {An Information Theoretic Approach to Speaker Diarization of Meeting Data},
author = {Vijayasenan, D.  and Valente, F.  and Bourlard, H. },
crossref = {vijayasenan:rr08-58},
journal = {IEEE Transactions on Audio Speech and Language Processing},
year = {2009},
volume = {17},
number = {7},
pages = {1382--1393},
doi = {10.1109/tasl.2009.2015698},
keywords = {IM2.AP, Report_VIII},
abstract = {A speaker diarization system based on an information theoretic framework is described. The problem is formulated according to the Information Bottleneck (IB) principle. Unlike other approaches where the distance between speaker segments is arbitrarily introduced, the IB method seeks the partition that maximizes the mutual information between observations and variables relevant for the problem while minimizing the distortion between observations. This solves the problem of choosing the distance between speech segments, which becomes the Jensen-Shannon divergence as it arises from the IB objective function optimization. We discuss issues related to speaker diarization using this information theoretic framework such as the criteria for inferring the number of speakers, the trade-off between quality and compression achieved by the diarization system, and the algorithms for optimizing the objective function. Furthermore we benchmark the proposed system against a state-of-the-art system on the NIST RT06 (Rich Transcription) data set for speaker diarization of meetings. The IB based system achieves a Diarization Error Rate of 23.2\% compared to 23.6\% for the baseline system. This approach being mainly based on nonparametric clustering, it runs significantly faster than the baseline HMM/GMM based system, resulting in faster-than-real-time diarization.},
projects = {Idiap,
IM2,
AMIDA},
}

@inproceedings{renals:asru:2007,
title = {Recognition and understanding of meetings the ami and amida projects},
author = {Renals, S.  and Hain, T.  and Bourlard, H. },
crossref = {renals:rr07-46},
booktitle = {Proc. of the IEEE Workshop on Automatic Speech Recognition and Understanding, ASRU'07},
year = {2007},
pages = {238--247},
location = {Kyoto},
isbn = {978-1-4244-1746-9},
note = {IDIAP-RR 07-46},
doi = {10.1109/asru.2007.4430116},
keywords = {IM2.MCA, Report_VII},
abstract = {The AMI and AMIDA projects are concerned with the recognition and interpretation of multiparty meetings. Within these projects we have: developed an infrastructure for recording meetings using multiple microphones and cameras; released a 100 hour annotated corpus of meetings; developed techniques for the recognition and interpretation of meetings based primarily on speech recognition and computer vision; and developed an evaluation framework at both component and system levels. In this paper we present an overview of these projects, with an emphasis on speech recognition and content extraction.}
}

@inproceedings{sgarimel:is:2008,
title = {Introducing temporal asymmetries in feature extraction for automatic speech recognition},
author = {Sivaram, G. S. V. S.  and Hermansky, H. },
crossref = {sgarimel:rr08-25},
booktitle = {Interspeech 2008},
year = {2008},
location = {Brisbane, Australia},
note = {IDIAP-RR 08-25},
keywords = {IM2.AP, Report_VII},
abstract = {We propose a new auditory inspired feature extraction technique for automatic speech recognition (ASR). Features are extracted by filtering the temporal trajectory of spectral energies in each critical band of speech by a bank of finite impulse response (FIR) filters. Impulse responses of these filters are derived from a modified Gabor envelope in order to emulate asymmetries of the temporal receptive field (TRF) profiles observed in higher level auditory neurons. We obtain $11.4\%$ relative improvement in word error rate on OGI-Digits database and, $3.2\%$ relative improvement in phoneme error rate on TIMIT database over the MRASTA technique.}
}

@article{2009-bologna-neurocomputing,
title = {On the use of the auditory pathway to represent image scenes in real-time},
author = {Bologna, G.  and Deville, B.  and Pun, T. },
journal = {Neurocomputing},
year = {2009},
volume = {72},
pages = {839--849},
keywords = {IM2.MCA, Report_VIII}
}

@inproceedings{nscaringella:ismir:2008,
title = {Timbre and Rhythmic TRAP-TANDEM features for music information retrieval},
author = {Scaringella, N. },
crossref = {nscaringella:rr08-46},
booktitle = {Int. Conf. on Music Information Retrieval (ISMIR)},
year = {2008},
keywords = {IM2.AP, Report_VIII},
abstract = {The enormous growth of digital music databases has led to a comparable growth in the need for methods that help users organize and access such information. One area in particular that has seen much recent research activity is the use of automated techniques to describe audio content and to allow for its identification, browsing and retrieval. Conventional approaches to music content description rely on features characterizing the shape of the signal spectrum in relatively short-term frames. In the context of Automatic Speech Recognition (ASR), Hermansky \cite{Hermansky_1} described an interesting alternative to short-term spectrum features, the TRAP-TANDEM approach which uses long-term band-limited features trained in a supervised fashion. We adapt this idea to the specific case of music signals and propose a generic system for the description of temporal patterns. The same system with different settings is able to extract features describing either timbre or rhythmic content. The quality of the generated features is demonstrated in a set of music retrieval experiments and compared to other state-of-the-art models.},
ipdmembership = {speech},
projects = {Idiap},
pdf = {ftp://ftp.idiap.ch/pub/papers/2008/scaringella-ismir-2008.pdf}
}

@inproceedings{Li2007,
title = {Non-linear spectral stretching for in-car speech recognition},
author = {Li, W.  and Bourlard, H. },
booktitle = {Interspeech},
year = {2007},
keywords = {Report_VII, IM2.AP},
owner = {dines}
}

@inproceedings{chavarriaga:eusipco:2008,
title = {Asynchronous detection and classification of oscillatory brain activity},
author = {Chavarriaga, R.  and Galán, F.  and Millán, J. del R. },
crossref = {chavarriaga:rr08-36},
booktitle = {16 European Signal Processing Conference (EUSIPCO 2008)},
year = {2008},
location = {Lausanne},
note = {IDIAP-RR 08-36},
keywords = {IM2.BMI, Report_VII},
abstract = {The characterization and recognition of electrical signatures of brain activity constitutes a real challenge. Applications such as Brain-Computer Interfaces (BCI) are based on the accurate identification of mental processes in order to control external devices. Traditionally, classification of brain activity patterns relies on the assumption that the neurological phenomena that characterize mental states is continuously present in the signal. However, recent evidence shows that some mental processes are better characterized by episodic activity that is not necessarily synchronized with external stimuli. In this paper, we present a method for classification of mental states based on the detection of this episodic activity. Instead of performing classification on all available data, the proposed method identifies informative samples based on the class sample distribution in a projected canonical feature space. Classification results are compared to traditional methods using both artificial data and real EEG recordings.}
}

@inproceedings{Li2008,
title = {A neural network based regression approach for recognizing simultaneous speech},
author = {Li, W.  and Kumatani, K.  and Dines, J.  and Magimai-Doss, M.  and Bourlard, H. },
booktitle = {Joint Workshop on Machine Learning and Multimodal Interaction},
year = {2008},
keywords = {Report_VII, IM2.AP},
owner = {dines}
}

@phdthesis{silviachiappa:phd:2006,
title = {Analysis and classification of eeg signals using probabilistic models for brain computer interfaces},
author = {Chiappa, S. },
year = {2006},
school = {\'Ecole Polytechnique F\'ed\'erale de Lausanne},
keywords = {Report_VI, IM2.BMI},
ipdmembership = {Learning},
ipdxref = {techreport:silviachiappa-idiap-rr-06-48.bib},
pdf = {ftp://ftp.idiap.ch/pub/reports/2006/silviachiappa-phd-2006.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2006/silviachiappa-phd-2006.ps.gz}
}

@inproceedings{Armstrong-7-ISSCO,
title = {The wizard of oz meets multimodal language-enabled gui interfaces: new challenges},
author = {Lisowska, A.  and Armstrong, S.  and Melichar, M.  and Ailomaa, M.  and Rajman, M. },
booktitle = {Proceedings of CHI' 07},
series = {Beyond Current User Research: Designing Methods for New Users, T},
year = {2007},
location = {San Jos\'e, California},
keywords = {Report_VI, IM2.HMI, joint publication, April 28 - May 3}
}

@inproceedings{Drygajlo4,
title = {On quality of quality measures for classification},
author = {Kryszczuk, K.  and Drygajlo, A. },
booktitle = {Biometrics and Identity Management, Lecture Notes in Computer Science 5372},
year = {2008},
editor = {Schouten, B.  and Juul, N.  and Drygajlo, A.  and Tistarelli, M. },
pages = {19--28},
publisher = {Springer},
keywords = {IM2.MPR, Report_VIII}
}

@article{Vinciarelli_JIVC_2009,
title = {Social Signal Processing: Survey of an Emerging Domain},
author = {Vinciarelli, A.  and Pantic, M.  and Bourlard, H. },
journal = {Image and Vision Computing},
year = {2009},
note = {to appear},
keywords = {Computer Vision, human behaviour analysis, Social Interactions, Social signals, speech processing, IM2.MCA, Report_VIII},
abstract = {The ability to understand and manage social signals of a person we are communicating with is the core of social intelligence. Social intelligence is a facet of human intelligence that has been argued to be indispensable and perhaps the most important for success in life. This paper argues that next-generation computing needs to include the essence of social intelligence - the ability to recognize human social signals and social behaviours like turn taking, politeness, and disagreement - in order to become more effective and more efficient. Although each one of us understands the importance of social signals in everyday life situations, and in spite of recent advances in machine analysis of relevant behavioural cues like blinks, smiles, crossed arms, laughter, and similar, design and development of automated systems for Social Signal Processing (SSP) are rather difficult. This paper surveys the past efforts in solving these problems by a computer, it summarizes the relevant findings in social psychology, and it proposes a set of recommendations for enabling the development of the next generation of socially-aware computing.},
projects = {SSPNet,
IM2},
}

@inproceedings{MMSPL-CONF05-2009-011,
title = {Multimodal person search combining information fusion and relevance feedback},
author = {Goldmann, L.  and Samour, A.  and Ebrahimi, T.  and Sikora, T. },
booktitle = {IEEE International Workshop on Multimedia Signal Processing (MMSP 2009)},
year = {2009},
location = {Rio de Janeiro, Brazil},
url = {http://mmsp09.org/},
keywords = {content based multimedia retrieval, sensing people, relevance feedback; multimodal fusion, IM2.MCA, Report_VIII},
abstract = {http://mmsp09.org/},
details = {http://infoscience.epfl.ch/record/139546},
oai-id = {oai:infoscience.epfl.ch:139546},
oai-set = {conf},
unit = {MMSPL}
}

@inproceedings{hung:MM:2007,
title = {Using audio and video features to classify the most dominant person in a group meeting},
author = {Hung, H.  and Jayagopi, D.  and Yeo, C.  and Friedland, G.  and Ba, S.  and Odobez, J. -M.  and Ramchandran, K.  and Mirghafori, N.  and Gatica-Perez, D. },
year = {2007},
note = {IDIAP-RR 07-29},
keywords = {Report_VI, IM2.MPR, joint publication},
abstract = {The automated extraction of semantically meaningful information from multi-modal data is becoming increasingly necessary due to the escalation of captured data for archival. A novel area of multi-modal data labelling, which has received relatively little attention, is the automatic estimation of the most dominant person in a group meeting. In this paper, we provide a framework for detecting dominance in group meetings using different audio and video cues. We show that by using a simple model for dominance estimation we can obtain promising results.}
}

@inproceedings{joint-Bunke-Humm,
title = {Graph sequence visualisation and its application to computer network monitoring and abnormal event detection},
author = {Bunke, H.  and Dickinson, P.  and Humm, A.  and Irniger, C.  and Kraetzl, M. },
editor = {Kandel, A.  and Bunke, H.  and Last, M. },
booktitle = {Applied Graph Theory in Computer Vision and Pattern Recognition},
year = {2007},
pages = {227--245},
publisher = {Springer},
keywords = {Report_VI, IM2.ACP, joint publication}
}

@incollection{millan:2006:mit-error,
title = {Error-related eeg potentials in brain-computer interfaces},
author = {Ferrez, P. W.  and Millán, J. del R. },
editor = {Dornhege, G.  and Millán, J. del R.  and Hinterberger, T.  and McFarland, D.  and Müller, K. -R. },
booktitle = {Towards Brain-Computer Interfacing},
year = {2007},
publisher = {The MIT Press},
keywords = {IM2.BCI, Report_VII},
abstract = {Brain-computer interfaces (BCI), as any other interaction modality based on physiological signals and body channels (e.g., muscular activity, speech and gestures), are prone to errors in the recognition of subject's intent. An elegant approach to improve the accuracy of BCIs consists in a verification procedure directly based on the presence of error-related potentials (ErrP) in the EEG recorded right after the occurrence of an error. Most of these studies show the presence of ErrP in typical choice reaction tasks where subjects respond to a stimulus and ErrP arise following errors due to the subject's incorrect motor action. However, in the context of a BCI, the central question is: "Are ErrP also elicited when the error is made by the interface during the recognition of the subject's intent?" We have thus explored whether ErrP also follow a feedback indicating incorrect responses of the interface and no longer errors of the subject himself. Four healthy volunteer subjects participated in a simple human-robot interaction experiment (i.e., bringing the robot to either the left or right side of a room), which seem to reveal a new kind of ErrP. These "interaction ErrP" exhibit a first sharp negative peak followed by a broader positive peak and a second negative peak ( 270, 400 and 550 ms after the feedback, respectively). But in order to exploit these ErrP we need to detect it in each single trial using a short window following the feedback that shows the response of the classifier embedded in the BCI. We have achieved an average recognition rate of correct and erroneous single trials of 83.7\% and 80.2\%, respectively. We also show that the integration of these ErrP in a BCI, where the subject's intent is not executed if an ErrP is detected, significantly improves the performance of the BCI.}
}

@article{AndreasHumm20084,
title = {Spoken signature for user authentication},
author = {Humm, A.  and Hennebert, J.  and Ingold, R. },
journal = {SPIE Journal of Electronic Imaging},
year = {2008},
volume = {17},
keywords = {IM2.MPR, Report_VIII}
}

@inproceedings{schlapbach08estimating,
title = {Estimating the readability of handwritten text -- a support vector regression based approach},
author = {Schlapbach, A.  and Bunke, H.  and Wettstein, F. },
booktitle = {Proc. 19th Int. Conf. on Pattern Recognition},
year = {2008},
publisher = {IEEE},
keywords = {IM2.VP, Report_VIII},
peer = {yes}
}

@inproceedings{valente:Interspeech-2:2007,
title = {Multi-stream features combination based on dempster-shafer rule for lvcsr system},
author = {Valente, F.  and Vepa, J.  and Hermansky, H. },
booktitle = {Interspeech 2007},
year = {2007},
note = {IDIAP-RR 07-09},
keywords = {Report_VI, IM2.AP.MPR, joint publication},
abstract = {This paper investigates the combination of two streams of acoustic features. Extending our previous work on small vocabulary task, we show that combination based on Dempster-Shafer rule outperforms several classical rules like sum, product and inverse entropy weighting even in LVCSR systems. We analyze results in terms of Frame Error Rate and Cross Entropy measures. Experimental framework uses meeting transcription task and results are provided on RT05 evaluation data. Results are consistent with what has been previously observed on smaller databases.},
ipdmembership = {speech},
ipdxref = {techreport:valente-idiap-rr-07-09.bib},
pdf = {ftp://ftp.idiap.ch/pub/papers/2007/valente-Interspeech-2-2007.pdf},
postscript = {ftp://ftp.idiap.ch/pub/papers/2007/valente-Interspeech-2-2007.ps.gz}
}

@article{PerezFreire:AA:IFS2006,
title = {An Accurate Analysis of Scalar Quantization-Based Data Hiding},
author = {P\'erez-Freire, L.  and P\'erez-González, F.  and Voloshynovskiy, S. },
journal = {IEEE Trans. on Information Forensics and Security},
year = {2006},
volume = {1},
number = {1},
pages = {80--86},
url = {http://vision.unige.ch/publications/postscript/2005/PerezFreirePerezGonzalezVoloshynovskiy_IFS2006.pdf},
keywords = {Report_VI, IM2.MPR},
vgclass = {refpap},
vgproject = {watermarking}
}

@techreport{gaudard:rr07-02,
title = {Speech recognition based on template matching and phone posterior probabilities},
author = {Gaudard, C.  and Aradilla, G.  and Bourlard, H. },
year = {2007},
type = {IDIAP-COM},
number = {02},
institution = {IDIAP},
keywords = {Report_VI, IM2.AP},
ipdmembership = {speech},
pdf = {ftp://ftp.idiap.ch/pub/reports/2007/gaudard-idiap-com-07-02.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2007/gaudard-idiap-com-07-02.ps.gz}
}

@techreport{dines:rr07-13,
title = {Direct optimisation of a multilayer perceptron for the estimation of cepstral mean and variance statistics},
author = {Dines, J.  and Vepa, J. },
year = {2007},
type = {IDIAP-RR},
number = {13},
institution = {IDIAP},
keywords = {Report_VI, IM2.AP},
abstract = {We propose an alternative means of training a multilayer perceptron for the task of speech activity detection based on a criterion to minimise the error in the estimation of mean and variance statistics for speech cepstrum based features using the Kullback-Leibler divergence. We present our baseline and proposed speech activity detection approaches for multi-channel meeting room recordings and demonstrate the effectiveness of the new criterion by comparing the two approaches when used to carry out cepstrum mean and variance normalisation of features used in our meeting ASR system.},
ipdmembership = {speech},
pdf = {ftp://ftp.idiap.ch/pub/reports/2007/dines-idiap-rr-07-13.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2007/dines-idiap-rr-07-13.ps.gz}
}

@techreport{dines:rr07-12,
title = {A study of phoneme and grapheme based context-dependent asr systems},
author = {Dines, J.  and Magimai-Doss, M. },
year = {2007},
type = {IDIAP-RR},
number = {12},
institution = {IDIAP},
keywords = {Report_VI, IM2.AP, major},
abstract = {In this paper we present a study of automatic speech recognition systems using context-dependent phonemes and graphemes as sub-word units based on the conventional HMM/GMM system as well as tandem system. Experimental studies conducted on three different continuous speech recognition tasks show that systems using only context-dependent graphemes can yield competitive performance on small to medium vocabulary tasks when compared to a context-dependent phoneme-based automatic speech recognition system. In particular, we demonstrate the utility of tandem features that use an MLP trained to estimate phoneme posterior probabilities in improving grapheme based recognition system performance by incorporating phonemic knowledge into the system without having to explicitly define a phonetically transcribed lexicon.},
ipdmembership = {speech},
pdf = {ftp://ftp.idiap.ch/pub/reports/2007/dines-idiap-rr-07-12.pdf},
postscript = {ftp://ftp.idiap.ch/pub/reports/2007/dines-idiap-rr-07-12.ps.gz}
}

@inproceedings{HungGatica08,
title = {Identifying dominant people in meetings from audio-visual sensors},
author = {Hung, H.  and Gatica-Perez, D. },
booktitle = {Proc. IEEE Int. Conf. on Automatic Face and Gesture Recognition (FG), Special Session on Multi-Sensor HCI for Smart Environments},
year = {2008},
keywords = {IM2.MPR, Report_VIII}
}

@inproceedings{huijbregts07b,
title = {Filtering the Unknown: Speech Activity Detection in Heterogeneous Video Collections},
author = {Huijbregts, M.  and Wooters, C.  and Ordelman, R. },
booktitle = {Proceedings of Interspeech, Antwerp},
year = {2007},
note = {to appear},
keywords = {Report_VI, IM2.AP}
}

@inproceedings{Armstrong-6-ISSCO,
title = {Minimizing modality bias when exploring input preference for multimodal systems in new domains: the archivus case study},
author = {Lisowska, A.  and Betrancourt, M.  and Armstrong, S.  and Rajman, M. },
booktitle = {CHI' 07},
year = {2007},
location = {San Jos\'e, California},
keywords = {Report_VI, IM2.HMI, joint publication, major, April 28 - May 3}
}

@inproceedings{KokPirFro-ICPR.08,
title = {Graph-based classification for multiple observations of transformed patterns},
author = {Kokiopoulou, E.  and Pirillos, S.  and Frossard, P. },
booktitle = {IEEE Int. Conf. Pattern Recognition (ICPR)},
year = {2008},
keywords = {Report_VII, IM2.DMA.VP, joint publication}
}

@inproceedings{Luo_ACCV09,
title = {An online framework for learning novel concepts over multiple cues},
author = {Luo, J.  and Orabona, F.  and Caputo, B. },
booktitle = {Proceeding of The 9th Asian Conference on Computer Vision},
year = {2009},
location = {Xi'an, China},
keywords = {IM2.MPR, Report_VIII},
abstract = {We propose an online learning algorithm to tackle the problem of learning under limited computational resources in a teacher-student scenario, over multiple visual cues. For each separate cue, we train an online learning algorithm that sacrifices performance in favor of bounded memory growth and fast update of the solution. We then recover back performance by using multiple cues in the online setting. To this end, we use a two-layers structure. In the first layer, we use a budget online learning algorithm for each single cue. Thus, each classifier provides confidence interpretations for target categories. On top of these classifiers, a linear online learning algorithm is added to learn the combination of these cues. As in standard online learning setups, the learning takes place in rounds. On each round, a new hypothesis is estimated as a function of the previous one. We test our algorithm on two student-teacher experimental scenarios and in both cases results show that the algorithm learns the new concepts in real time and generalizes well.},
projects = {Idiap,
DIRAC},
}

@phdthesis{just-thesis-2006,
  title         = {Two-handed gestures for human-computer interaction},
  author        = {Just, A. },
  year          = {2006},
  type          = {{PhD} Thesis},
  school        = {\'Ecole Polytechnique F\'ed\'erale de Lausanne},
  note          = {PhD Thesis \#3683 at the \'Ecole Polytechnique F\'ed\'erale de Lausanne. Also available as IDIAP-RR 06-73},
  keywords      = {Report_VI, IM2.VP, Human-Computer Interaction, Computer Vision, Hand Posture Recognition, Modified Census Transform, Hand Gesture Recognition, Hidden Markov Model, Input-Output Hidden Markov Model},
  abstract      = {The present thesis is concerned with the development and evaluation (in terms of accuracy and utility) of systems using hand postures and hand gestures for enhanced Human-Computer Interaction (HCI). In our case, these systems are based on vision techniques, thus only requiring cameras, and no other specific sensors or devices. When dealing with hand movements, it is necessary to distinguish two aspects of these hand movements: the \textit{static} aspect and the \textit{dynamic} aspect. The static aspect is characterized by a pose or configuration of the hand in an image and is related to the Hand Posture Recognition (HPR) problem. The dynamic aspect is defined either by the trajectory of the hand, or by a series of hand postures in a sequence of images. This second aspect is related to the Hand Gesture Recognition (HGR) task. Given the recognized lack of common evaluation databases in the HGR field, a first contribution of this thesis was the collection and public distribution of two databases, containing both one- and two-handed gestures, which part of the results reported here will be based upon. On these databases, we compare two state-of-the-art models for the task of HGR. As a second contribution, we propose a HPR technique based on a new feature extraction. This method has the advantage of being faster than conventional methods while yielding good performances. In addition, we provide comparison results of this method with other state-of-the-art technique. Finally, the most important contribution of this thesis lies in the thorough study of the state-of-the-art not only in HGR and HPR but also more generally in the field of HCI. The first chapter of the thesis provides an extended study of the state-of-the-art. The second chapter of this thesis contributes to HPR. We propose to apply for HPR a technique employed with success for face detection. This method is based on the Modified Census Transform (MCT) to extract relevant features in images. 
We evaluate this technique on an existing benchmark database and provide comparison results with other state-of-the-art approaches. The third chapter is related to HGR. In this chapter we describe the first recorded database, containing both one- and two-handed gestures in the 3D space. We propose to compare two models used with success in HGR, namely Hidden Markov Models (HMM) and Input-Output Hidden Markov Model (IOHMM). The fourth chapter is also focused on HGR but more precisely on two-handed gesture recognition. For that purpose, a second database has been recorded using two cameras. The goal of these gestures is to manipulate virtual objects on a screen. We propose to investigate on this second database the state-of-the-art sequence processing techniques we used in the previous chapter. We then discuss the results obtained using different features, and using images of one or two cameras. In conclusion, we propose a method for HPR based on new feature extraction. For HGR, we provide two databases and comparison results of two major sequence processing techniques. Finally, we present a complete survey on recent state-of-the-art techniques for both HPR and HGR. We also present some possible applications of these techniques, applied to two-handed gesture interaction. We hope this research will open new directions in the field of hand posture and gesture recognition.},
  ipdmembership = {vision},
  pdf           = {ftp://ftp.idiap.ch/pub/reports/2006/just-idiap-rr-06-73.pdf},
  postscript    = {ftp://ftp.idiap.ch/pub/reports/2006/just-idiap-rr-06-73.ps.gz}
}

@article{graves09novel,
  title    = {A novel connectionist system for unconstrained handwriting recognition},
  author   = {Graves, A.  and Liwicki, M.  and Fernandez, S.  and Bertolami, R.  and Bunke, H.  and Schmidhuber, J. },
  journal  = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year     = {2009},
  volume   = {31},
  number   = {5},
  pages    = {855--869},
  issn     = {0162-8828},
  keywords = {IM2.VP, Report_VIII},
  peer     = {yes}
}

@inproceedings{Parthasarathi_TSD2008_2008,
  title     = {Exploiting Contextual Information for Speech/Non-Speech Detection},
  author    = {Parthasarathi, S. H. K.  and Motlicek, P.  and Hermansky, H. },
  crossref  = {parthasarathi:rr08-22},
  booktitle = {Text, Speech and Dialogue},
  series    = {Lecture Notes In Artificial Intelligence (LNAI)},
  year      = {2008},
  volume    = {5246},
  pages     = {451--459},
  publisher = {Springer-Verlag},
  address   = {Berlin, Heidelberg},
  location  = {Brno, Czech Republic},
  isbn      = {978-3-540-87390-7},
  keywords  = {IM2.AP, Report_VIII},
  abstract  = {In this paper, we investigate the effect of temporal context for speech/non-speech detection (SND). It is shown that even a simple feature such as full-band energy, when employed with a large-enough context, shows promise for further investigation. Experimental evaluations on the test data set, with a state-of-the-art multi-layer perceptron based SND system and a simple energy threshold based SND method, using the F-measure, show an absolute performance gain of 4.4\% and 5.4\% respectively. The optimal contextual length was found to be 1000 ms. Further numerical optimizations yield an improvement (3.37\% absolute), resulting in an absolute gain of 7.77\% and 8.77\% over the MLP based and energy based methods respectively. ROC based performance evaluation also reveals promising performance for the proposed method, particularly in low SNR conditions.},
  projects  = {Idiap},
}

@inproceedings{singla08,
  title     = {Cross-lingual sentence extraction for information distillation},
  author    = {Singla, A.  and Hakkani-Tur, D. },
  booktitle = {Proceedings of Interspeech 2008},
  year      = {2008},
  address   = {Brisbane, Australia},
  note      = {To appear},
  keywords  = {Report_VII, IM2.AP}
}

@incollection{Shriberg2007,
  title     = {Higher level features in speaker recognition},
  author    = {Shriberg, E. },
  editor    = {Muller, C. },
  booktitle = {Speaker Classification I},
  series    = {Lecture Notes in Computer Science},
  year      = {2007},
  publisher = {Springer},
  keywords  = {Report_VII, IM2.AP},
  owner     = {dines}
}

@article{TNSRE2007,
  title    = {{EEG}-based synchronized brain-computer interfaces: a model for optimizing the number of mental tasks},
  author   = {Kronegg, J.  and Chanel, G.  and Voloshynovskiy, S.  and Pun, T. },
  journal  = {IEEE Trans. on Neural Systems and Rehabilitation Engineering},
  year     = {2007},
  volume   = {15},
  number   = {1},
  pages    = {50--58},
  keywords = {Report_VI, IM2.MPR}
}

@techreport{ketabdar:rr06-68,
  title         = {Identifying unexpected words using in-context and out-of-context phoneme posteriors},
  author        = {Ketabdar, H.  and Hermansky, H. },
  year          = {2006},
  type          = {IDIAP-RR},
  number        = {68},
  institution   = {IDIAP},
  keywords      = {Report_VI, IM2.AP},
  abstract      = {The paper proposes and discusses a machine approach for identification of unexpected (zero or low probability) words. The approach is based on use of two parallel recognition channels, one channel employing sensory information from the speech signal together with a prior context information provided by the pronunciation dictionary and grammatical constraints, to estimate `in-context' posterior probabilities of phonemes, the other channel being independent of the context information and entirely driven by the sensory data to deliver estimates of `out-of-context' posterior probabilities of phonemes. A significant mismatch between the information from these two channels indicates unexpected word. The viability of this concept is demonstrated on identification of out-of-vocabulary digits in continuous digit streams. The comparison of these two channels provides a confidence measure on the output of the recognizer. Unlike conventional confidence measures, this measure is not relying on phone and word segmentation (boundary detection), thus it is not affected by possibly imperfect segment boundary detection. In addition, being a relative measure, it is more discriminative than the conventional posterior based measures.},
  ipdmembership = {speech},
  pdf           = {ftp://ftp.idiap.ch/pub/reports/2006/ketabdar-idiap-rr-06-68.pdf},
  postscript    = {ftp://ftp.idiap.ch/pub/reports/2006/ketabdar-idiap-rr-06-68.ps.gz}
}

@incollection{bunke08matching,
  title     = {Matching of hypergraphs -- algorithms, applications, and experiments},
  author    = {Bunke, H.  and Dickinson, P.  and Neuhaus, M.  and Stettler, M. },
  editor    = {Bunke, H.  and Kandel, A.  and Last, M. },
  booktitle = {Applied Pattern Recognition},
  year      = {2008},
  pages     = {131--154},
  publisher = {Springer},
  keywords  = {Report_VII, IM2.VP},
  peer      = {yes}
}

@inproceedings{Shasha10101,
  title     = {Leveraging sentence weights in a concept-based optimization framework for extractive meeting summarization},
  author    = {Xie, S.  and Favre, B.  and Hakkani-Tur, D.  and Liu, Y. },
  booktitle = {10th International Conference of the International Speech Communication Association},
  year      = {2009},
  address   = {Brighton, UK},
  keywords  = {IM2.AP, Report_VIII}
}

@article{KokFro-TPAMI.08,
  title    = {Minimum distance between pattern transformation manifolds: algorithm and applications},
  author   = {Kokiopoulou, E.  and Frossard, P. },
  journal  = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year     = {2008},
  keywords = {Report_VII, IM2.DMA.VP, joint},
  note     = {In press}
}

@inproceedings{levit07,
  title     = {Integrating Several Annotation Layers for Statistical Information Distillation},
  author    = {Levit, M.  and Hakkani-Tur, D.  and Tur, G.  and Gillick, D. },
  booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding (ASRU 07)},
  year      = {2007},
  address   = {Kyoto, Japan},
  keywords  = {Report_VII, IM2.AP}
}

@incollection{Keshet_WILEY-2_2009,
  title     = {A Kernel Wrapper for Phoneme Sequence Recognition},
  author    = {Keshet, J.  and Chazan, D. },
  editor    = {Keshet, J.  and Bengio, S. },
  booktitle = {Automatic Speech and Speaker Recognition: Large Margin and Kernel Methods},
  year      = {2009},
  publisher = {John Wiley and Sons},
  keywords  = {IM2.AP, Report_VIII},
  abstract  = {We describe a kernel wrapper, a Mercer kernel for the task of phoneme sequence recognition which is based on operations with the Gaussian kernel, and suitable for any sequence kernel classifier. We start by presenting a kernel-based algorithm for phoneme sequence recognition, which aims at minimizing the Levenshtein distance (edit distance) between the predicted phoneme sequence and the true phoneme sequence. Motivated by the good results of frame-based phoneme classification using SVMs with Gaussian kernel, we devised a kernel for speech utterances and phoneme sequences, which generalizes the kernel function for phoneme frame-based classification and adds timing constraints in the form of transitions and durations constraints. The kernel function has three parts corresponding to phoneme acoustic model, phoneme duration model and phoneme transition model. We present initial encouraging experimental results with the TIMIT corpus.},
  projects  = {Idiap}
}

@article{Ba:IEEE-SMC-B:2008,
  title         = {Recognizing visual focus of attention from head pose in natural meetings},
  author        = {Ba, S.  and Odobez, J. -M. },
  journal       = {IEEE Transactions on Systems, Man, and Cybernetics, Part B},
  year          = {2008},
  note          = {Accepted for publication},
  internal-note = {Likely the same work as Ba:IEEE-SMC-B:2009 (published version) -- check before citing both},
  keywords      = {Report_VII, IM2.VP}
}

@article{Ba:IEEE-SMC-B:2009,
  title    = {Recognizing human visual focus of attention from head pose in meetings},
  author   = {Ba, S.  and Odobez, J. -M. },
  journal  = {IEEE Transactions on Systems, Man, and Cybernetics, Part B},
  year     = {2009},
  volume   = {39},
  number   = {1},
  pages    = {16--34},
  keywords = {IM2.VP, Report_VIII}
}

@incollection{bunke07offlineRoman,
  title     = {Off-line {Roman} cursive handwriting recognition},
  author    = {Bunke, H.  and Varga, T. },
  editor    = {Chaudhuri, B. },
  booktitle = {Digital Document Processing: Major Directions and Recent Advances},
  series    = {Advances in Pattern Recognition},
  year      = {2007},
  volume    = {20},
  pages     = {165--173},
  publisher = {Springer},
  keywords  = {Report_VI, IM2.ACP},
  peer      = {yes}
}

@inproceedings{BLGSSM:nips:2006,
  title         = {Unified inference for variational {Bayesian} linear {Gaussian} state-space models},
  author        = {Barber, D.  and Chiappa, S. },
  booktitle     = {NIPS},
  year          = {2006},
  note          = {IDIAP-RR 06-50},
  keywords      = {Report_VI, IM2.MPR},
  abstract      = {Linear Gaussian State-Space Models are widely used and a Bayesian treatment of parameters is therefore of considerable interest. The approximate Variational Bayesian method applied to these models is an attractive approach, used successfully in applications ranging from acoustics to bioinformatics. The most challenging aspect of implementing the method is in performing inference on the hidden state sequence of the model. We show how to convert the inference problem so that standard and stable Kalman Filtering/Smoothing recursions from the literature may be applied. This is in contrast to previously published approaches based on Belief Propagation. Our framework both simplifies and unifies the inference problem, so that future applications may be easily developed. We demonstrate the elegance of the approach on Bayesian temporal ICA, with an application to finding independent components in noisy EEG signals.},
  ipdmembership = {learning},
  ipdxref       = {techreport:BLGSSM-idiap-rr-06-50.bib},
  pdf           = {ftp://ftp.idiap.ch/pub/papers/2006/BLGSSM-nips-2006.pdf},
  postscript    = {ftp://ftp.idiap.ch/pub/papers/2006/BLGSSM-nips-2006.ps.gz}
}

@inproceedings{ICTIR2007,
  title     = {Clustered multidimensional scaling for exploration in information retrieval},
  author    = {Szekely, E.  and Bruno, E.  and Marchand-Maillet, S. },
  booktitle = {International Conference on the Theory of Information Retrieval},
  year      = {2007},
  note      = {submitted},
  keywords  = {Report_VI, IM2.MCA}
}

@techreport{marcel:rr07-14,
  title         = {Joint bi-modal face and speaker authentication using explicit polynomial expansion},
  author        = {Marcel, S. },
  year          = {2007},
  type          = {IDIAP-RR},
  number        = {14},
  institution   = {IDIAP},
  note          = {Submitted for publication},
  keywords      = {Report_VI, IM2.MPR},
  ipdmembership = {vision},
  pdf           = {ftp://ftp.idiap.ch/pub/reports/2007/marcel-idiap-rr-07-14.pdf},
  postscript    = {ftp://ftp.idiap.ch/pub/reports/2007/marcel-idiap-rr-07-14.ps.gz}
}

@techreport{parthasarathi:rr08-23,
  title       = {A data-driven approach to speech/non-speech detection},
  author      = {Parthasarathi, S. H. K.  and Hermansky, H. },
  year        = {2008},
  type        = {Idiap-RR},
  number      = {Idiap-RR-23-2008},
  institution = {IDIAP},
  keywords    = {IM2.BMI, Report_VII},
  abstract    = {We present a data-driven approach to weighting the temporal context of signal energy to be used in a simple speech/non-speech detector (SND). The optimal weights are obtained using linear discriminant analysis (LDA). Regularization is performed to handle numerical issues inherent to the usage of correlated features. The discriminant so obtained is interpreted as a filter in the modulation spectral domain. Experimental evaluations on the test data set, in terms of average frame-level error rate over different SNR levels, show that the proposed method yields an absolute performance gain of $10.9\%$, $17.5\%$, $7.9\%$ and $8.3\%$ over ITU's G.729B, ETSI's AMR1, AMR2 and a state-of-the-art multi-layer perceptron based system, respectively. This shows that even a simple feature such as full-band energy, when employed with a large-enough context, shows promise for applications.}
}

@inproceedings{ferrez_nips_2007,
  title     = {{EEG}-based brain-computer interaction: improved accuracy by automatic single-trial error detection},
  author    = {Ferrez, P. W.  and Millán, J. del R. },
  booktitle = {Advances in Neural Information Processing Systems 20},
  year      = {2008},
  pages     = {441--448},
  location  = {Cambridge, MA},
  keywords  = {IM2.BCI, Report_VII},
  abstract  = {Brain-computer interfaces (BCIs), as any other interaction modality based on physiological signals and body channels (e.g., muscular activity, speech and gestures), are prone to errors in the recognition of subject's intent. An elegant approach to improve the accuracy of BCIs consists in a verification procedure directly based on the presence of error-related potentials (ErrP) in the EEG recorded right after the occurrence of an error. Six healthy volunteer subjects with no prior BCI experience participated in a new human-robot interaction experiment where they were asked to mentally move a cursor towards a target that can be reached within a few steps using motor imagination. This experiment confirms the previously reported presence of a new kind of ErrP. These Interaction ErrP exhibit a first sharp negative peak followed by a positive peak and a second broader negative peak ( 290, 350 and 470 ms after the feedback, respectively). But in order to exploit these ErrP we need to detect them in each single trial using a short window following the feedback associated to the response of the classifier embedded in the BCI. We have achieved an average recognition rate of correct and erroneous single trials of 81.8\% and 76.2\%, respectively. Furthermore, we have achieved an average recognition rate of the subject's intent while trying to mentally drive the cursor of 73.1\%. These results show that it's possible to simultaneously extract useful information for mental control to operate a brain-actuated device as well as cognitive states such as error potentials to improve the quality of the brain-computer interaction. Finally, using a well-known inverse model (sLORETA), we show that the main focus of activity at the occurrence of the ErrP are, as expected, in the pre-supplementary motor area and in the anterior cingulate cortex.}
}

@techreport{vinciarelli:rr07-40,
  title       = {Role recognition in radio programs using social affiliation networks and mixtures of discrete distributions: an approach inspired by social cognition},
  author      = {Vinciarelli, A.  and Favre, S. },
  year        = {2007},
  type        = {Idiap-RR},
  number      = {Idiap-RR-40-2007},
  institution = {IDIAP},
  note        = {Submitted for publication},
  keywords    = {IM2.MCA, Report_VII},
  abstract    = {This paper presents an approach for the recognition of the roles played by speakers participating in radio programs. The approach is inspired by social cognition, i.e. by the way humans make sense of people they do not know, and it includes unsupervised speaker clustering performed with Hidden Markov Models, Social Network Analysis and Mixtures of Bernoulli and Multinomial Distributions. The experiments are performed over two corpora of radio programs for a total of around 45 hours of material. The results show that more than 80 percent of the data time can be labeled correctly in terms of role.}
}

@inproceedings{paugam:esann:2007,
  title         = {A supervised learning approach based on {STDP} and polychronization in spiking neuron networks},
  author        = {Paugam-Moisy, H.  and Martinez, R.  and Bengio, S. },
  booktitle     = {European Symposium on Artificial Neural Networks, ESANN},
  year          = {2007},
  note          = {IDIAP-RR 06-54},
  keywords      = {Report_VI, IM2.MPR},
  abstract      = {We propose a network model of spiking neurons, without preimposed topology and driven by STDP (Spike-Time-Dependent Plasticity), a temporal Hebbian unsupervised learning mode, biologically observed. The model is further driven by a supervised learning algorithm, based on a margin criterion, that has effect on the synaptic delays linking the network to the output neurons, with classification as a goal task. The network processing and the resulting performance are completely explainable by the concept of polychronization, proposed by Izhikevich \cite{Izh06NComp}. The model emphasizes the computational capabilities of this concept.},
  ipdmembership = {learning},
  ipdxref       = {techreport:paugam-idiap-rr-06-54.bib},
  pdf           = {ftp://ftp.idiap.ch/pub/papers/2007/paugam-esann-2007.pdf},
  postscript    = {ftp://ftp.idiap.ch/pub/papers/2007/paugam-esann-2007.ps.gz}
}

@inproceedings{pop09-anh,
  title     = {Automatic vs. human question answering over multimedia meeting recordings},
  author    = {Le, Q. A.  and Popescu-Belis, A. },
  booktitle = {Interspeech 2009 (10th Annual Conference of the International Speech Communication Association)},
  year      = {2009},
  keywords  = {IM2.HMI, Report_VIII}
}

@inproceedings{Tommasi_BMVC_2009,
  title     = {The more you know, the less you learn: from knowledge transfer to one-shot learning of object categories},
  author    = {Tommasi, T.  and Caputo, B. },
  booktitle = {BMVC},
  year      = {2009},
  keywords  = {IM2.MPR, Report_VIII},
  abstract  = {Learning a category from few examples is a challenging task for vision algorithms, while psychological studies have shown that humans are able to generalise correctly even from a single instance (one-shot learning). The most accredited hypothesis is that humans are able to exploit prior knowledge when learning a new related category. This paper presents an SVM-based model adaptation algorithm able to perform knowledge transfer for a new category when very limited examples are available. Using a leave- one-out estimate of the weighted error-rate the algorithm automatically decides from where to transfer (on which known category to rely), how much to transfer (the degree of adaptation) and if it is worth transferring something at all. Moreover a weighted least-squares loss function takes optimally care of data unbalance between negative and positive examples. Experiments presented on two different object category databases show that the proposed method is able to exploit previous knowledge avoiding negative transfer. The overall classification performance is increased compared to what would be achieved by starting from scratch. Furthermore as the number of already learned categories grows, the algorithm is able to learn a new category from one sample with increasing precision, i.e. it is able to perform one-shot learning.},
  projects  = {Idiap, DIRAC, EMMA},
}

@book{liwicki08recognition,
  title     = {Recognition of whiteboard notes -- online, offline and combination},
  author    = {Liwicki, M.  and Bunke, H. },
  year      = {2008},
  publisher = {World Scientific},
  isbn      = {978-9812814531},
  keywords  = {IM2.VP, Report_VIII}
}

@inproceedings{Imseng_ASRU_2009,
  title     = {Robust Speaker Diarization for Short Speech Recordings},
  author    = {Imseng, D.  and Friedland, G. },
  crossref  = {Imseng_Idiap-RR-26-2009},
  booktitle = {Proceedings of the IEEE workshop on Automatic Speech Recognition and Understanding},
  year      = {2009},
  location  = {Merano, Italy},
  keywords  = {IM2.AP, IM2.MCA, Report_VIII},
  abstract  = {We investigate a state-of-the-art Speaker Diarization system regarding its behavior on meetings that are much shorter (from 500 seconds down to 100 seconds) than those typically analyzed in Speaker Diarization benchmarks. First, the problems inherent to this task are analyzed. Then, we propose an approach that consists of a novel initialization parameter estimation method for typical state-of-the-art diarization approaches. The estimation method balances the relationship between the optimal value of the duration of speech data per Gaussian and the duration of the speech data, which is verified experimentally for the first time in this article. As a result, the Diarization Error Rate for short meetings extracted from the 2006, 2007, and 2009 NIST RT evaluation data is decreased by up to 50\% relative.},
  projects  = {Idiap, AMIDA, IM2},
}

@article{Armstrong-1-ISSCO,
  title     = {Une grammaire partag\'ee multi-t\^ache pour le traitement de la parole : application aux langues romanes},
  author    = {Bouillon, P.  and Rayner, M.  and Novellas Vall, B.  and Starlander, M.  and Santaholma, M.  and Nakao, Y.  and Chatzichrisafis, N. },
  journal   = {TAL (Traitement Automatique des Langues)},
  year      = {2007},
  volume    = {47},
  number    = {3},
  publisher = {Hermes \& Lavoisier},
  keywords  = {Report_VI, IM2.HMI}
}

@inproceedings{zhang-rr-06-41b,
  title         = {Exploring contextual information in a layered framework for group action recognition},
  author        = {Zhang, D.  and Gatica-Perez, D.  and Bengio, S. },
  booktitle     = {Proceedings of the Eighth International Conference on Multimodal Interfaces (ICMI'06)},
  year          = {2006},
  note          = {IDIAP-RR 06-41},
  keywords      = {Report_VI, IM2.MPR},
  abstract      = {Contextual information is important for sequence modeling. Hidden Markov Models (HMMs) and extensions, which have been widely used for sequence modeling, make simplifying, often unrealistic assumptions on the conditional independence of observations given the class labels, thus cannot accommodate overlapping features or long-term contextual information. In this paper, we introduce a principled layered framework with three implementation methods that take into account contextual information (as available in the whole or part of the sequence). The first two methods are based on state $\alpha$ and $\gamma$ posteriors (as usually referred to in the HMM formalism). The third method is based on Conditional Random Fields (CRFs), a conditional model that relaxes the independent assumption on the observations required by HMMs for computational tractability. We illustrate our methods with the application of recognizing group actions in meetings. Experiments and comparison with standard HMM baseline showed the validity of the proposed approach.},
  ipdmembership = {vision zhang},
  ipdxref       = {techreport:zhang-rr-06-41.bib},
  pdf           = {ftp://ftp.idiap.ch/pub/reports/2006/rr-06-41.pdf},
  postscript    = {ftp://ftp.idiap.ch/pub/reports/2006/rr-06-41.ps.gz}
}

@inproceedings{gonzalez-et-al-2009b,
  title     = {Steerable features for statistical {3D} dendrite detection},
  author    = {Gonzalez, G.  and Aguet, F.  and Fleuret, F.  and Unser, M.  and Fua, P. },
  booktitle = {Proceedings of the International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI)},
  year      = {2009},
  note      = {(to appear)},
  keywords  = {IM2.VP, Report_VIII}
}

@inproceedings{hung08,
  title     = {Estimating the Dominant Person in Multi-Party Conversations Using Speaker Diarization Strategies},
  author    = {Hung, H.  and Huang, Y.  and Friedland, G.  and Gatica-Perez, D. },
  booktitle = {Proceedings of IEEE ICASSP},
  year      = {2008},
  address   = {Las Vegas, NV},
  keywords  = {Report_VII, IM2.AP}
}

@inproceedings{favre08,
  title     = {Punctuating speech for information extraction},
  author    = {Favre, B.  and Grishman, R.  and Hillard, D.  and Ji, H.  and Hakkani-Tur, D.  and Ostendorf, M. },
  booktitle = {Proceedings of IEEE ICASSP},
  year      = {2008},
  address   = {Las Vegas, NV},
  keywords  = {Report_VII, IM2.AP}
}

@inproceedings{indermuehle09combining,
  title     = {Combining alignment results for historical handwritten document analysis},
  author    = {Indermühle, E.  and Liwicki, M.  and Bunke, H. },
  booktitle = {Proc. 10th Int. Conf. on Document Analysis and Recognition},
  year      = {2009},
  volume    = {3},
  pages     = {1186--1190},
  isbn      = {978-0-7695-3725-2},
  keywords  = {IM2.VP, Report_VIII},
  peer      = {yes}
}

@techreport{parthasarathi:rr08-21,
  title         = {Exploiting temporal context for speech/non-speech detection},
  author        = {Parthasarathi, S. H. K.  and Motlicek, P.  and Hermansky, H. },
  year          = {2008},
  month         = sep,
  type          = {Idiap-RR},
  number        = {Idiap-RR-21-2008},
  institution   = {IDIAP},
  keywords      = {IM2.AP,Report_VII},
  abstract      = {In this paper, we investigate the effect of temporal context for speech/non-speech detection (SND). It is shown that even a simple feature such as full-band energy, when employed with a large-enough context, shows promise for further investigation. Experimental evaluations on the test data set, with a state-of-the-art multi-layer perceptron based SND system and a simple energy threshold based SND method, using the F-measure, show an absolute performance gain of $4.4\%$ and $5.4\%$ respectively, when used with a context of 1000 ms. ROC based performance evaluation also reveals promising performance for the proposed method, particularly in low SNR conditions.},
  ipdmembership = {speech},
  projects      = {Idiap},
  pdf           = {ftp://ftp.idiap.ch/pub/reports/2008/parthasarathi-idiap-rr-08-21.pdf},
  postscript    = {ftp://ftp.idiap.ch/pub/reports/2008/parthasarathi-idiap-rr-08-21.ps.gz}
}

@inproceedings{orabona:bmvc:2007,
  title     = {Indoor place recognition using online independent support vector machines},
  author    = {Orabona, F.  and Castellini, C.  and Caputo, B.  and Luo, J.  and Sandini, G. },
  booktitle = {18th British Machine Vision Conference (BMVC07)},
  year      = {2007},
  pages     = {1090--1099},
  location  = {Warwick, UK},
  keywords  = {IM2.VP, Report_VII},
  abstract  = {In the framework of indoor mobile robotics, place recognition is a challenging task, where it is crucial that self-localization be enforced precisely, notwithstanding the changing conditions of illumination, objects being shifted around and/or people affecting the appearance of the scene. In this scenario online learning seems the main way out, thanks to the possibility of adapting to changes in a smart and flexible way. Nevertheless, standard machine learning approaches usually suffer when confronted with massive amounts of data and when asked to work online. Online learning requires a high training and testing speed, all the more in place recognition, where a continuous flow of data comes from one or more cameras. In this paper we follow the Support Vector Machines-based approach of Pronobis et al., proposing an improvement that we call Online Independent Support Vector Machines. This technique exploits linear independence in the image feature space to incrementally keep the size of the learning machine remarkably small while retaining the accuracy of a standard machine. Since the training and testing time crucially depend on the size of the machine, this solves the above stated problems. Our experimental results prove the effectiveness of the approach.}
}

@inproceedings{hung07,
  title     = {Using audio and video features to classify the most dominant person in meetings},
  author    = {Hung, H.  and Jayagopi, D.  and Yeo, C.  and Friedland, G.  and Ba, S.  and Odobez, J. -M.  and Ramchandran, K.  and Mirghafori, N.  and Gatica-Perez, D. },
  booktitle = {Proceedings of ACM Multimedia 2007},
  year      = {2007},
  pages     = {835--838},
  address   = {Augsburg, Germany},
  keywords  = {Report_VII, IM2.AP.VP, joint publication}
}

@inproceedings{Liang_ICASSP_2010,
  title     = {A Comparison of Supervised and Unsupervised Cross-Lingual Speaker Adaptation Approaches for HMM-Based Speech Synthesis},
  author    = {Liang, H.  and Dines, J.  and Saheer, L. },
  crossref  = {Liang_Idiap-RR-05-2010},
  booktitle = {Proceedings of IEEE International Conference on Acoustics, Speech, and Signal Processing},
  year      = {2010},
  month     = mar,
  pages     = {4598--4601},
  location  = {Dallas, U.S.A.},
  keywords  = {decision tree marginalization, HMM state mapping, unsupervised cross-lingual speaker adaptation, Report_IX, IM2.IP1, Group Bourlard, inproceedings},
  abstract  = {The EMIME project aims to build a personalized speech-to-speech translator, such that spoken input of a user in one language is used to produce spoken output that still sounds like the user's voice however in another language. This distinctiveness makes unsupervised cross-lingual speaker adaptation one key to the project's success. So far, research has been conducted into unsupervised and cross-lingual cases separately by means of decision tree marginalization and HMM state mapping respectively. In this paper we combine the two techniques to perform unsupervised cross-lingual speaker adaptation. The performance of eight speaker adaptation systems (supervised vs. unsupervised, intra-lingual vs. cross-lingual) are compared using objective and subjective evaluations. Experimental results show the performance of unsupervised cross-lingual speaker adaptation is comparable to that of the supervised case in terms of spectrum adaptation in the EMIME scenario, even though automatically obtained transcriptions have a very high phoneme error rate.},
  projects  = {Idiap, EMIME},
}

@article{Aran_PR_2010,
title = {A Multi-class Classification Strategy for Fisher Scores: Application to Signer Independent Sign Language Recognition},
author = {Aran, Oya and Akarun, Lale},
journal = {Pattern Recognition},
year = {2010},
month = {May},
volume = {43},
number = {5},
pages = {1776--1788},
doi = {10.1016/j.patcog.2009.12.002},
keywords = {Report_IX, IM2.IP1, Group Bourlard, article},
abstract = {Fisher kernels combine the powers of discriminative and generative classifiers by mapping the variable-length sequences to a new fixed length feature space, called the Fisher score space. The mapping is based on a single generative model and the classifier is intrinsically binary. We propose a multi-class classification strategy that applies a multi-class classification on each Fisher score space and combines the decisions of multi-class classifiers. We experimentally show that the Fisher scores of one class provide discriminative information for the other classes as well. We compare several multi-class classification strategies for Fisher scores generated from the hidden Markov models of sign sequences. The proposed multi-class classification strategy increases the classification accuracy in comparison with the state of the art strategies based on combining binary classifiers. To reduce the computational complexity of the Fisher score extraction and the training phases, we also propose a score space selection method and show that, similar or even higher accuracies can be obtained by using only a subset of the score spaces. Based on the proposed score space selection method, a signer adaptation technique is also presented that does not require any re-training.},
}

@inproceedings{Aran_LREC_2010,
title = {A Multimodal Corpus for Studying Dominance in Small Group Conversations},
author = {Aran, Oya and Hung, H. and Gatica-Perez, D.},
crossref = {Aran_Idiap-Internal-RR-107-2010},
booktitle = {LREC workshop on Multimodal Corpora: Advances in Capturing, Coding and Analyzing Multimodality, Malta, May 2010},
year = {2010},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
projects = {Idiap, NOVICOM, IM2, AMIDA},
}

@inproceedings{Yazdani_IEEE-ICSC2010_2010,
title = {A Random Walk Framework to Compute Textual Semantic Similarity: a Unified Model for Three Benchmark Tasks},
author = {Yazdani, M. and Popescu-Belis, A.},
booktitle = {Proceedings of the 4th IEEE International Conference on Semantic Computing (ICSC 2010), Carnegie Mellon University, Pittsburgh, PA, USA},
year = {2010},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
projects = {Idiap, IM2},
}

@article{Heusch_IVC_2009,
title = {A novel statistical generative model dedicated to face recognition},
author = {Heusch, G. and Marcel, S.},
crossref = {heusch:rr07-39},
journal = {Image {\&} Vision Computing},
year = {2009},
note = {in press},
keywords = {Report_IX, IM2.IP1, Group Bourlard, article},
projects = {Idiap, GMFace},
}

@techreport{Motlicek_Idiap-RR-03-2010,
title = {AMIDA/Klewel Mini-Project},
author = {Motlicek, P.  and Garner, P. N.  and Guillemot, M.  and Bozzo, Vincent},
year = {2010},
month = {January},
type = {Idiap-RR},
number = {Idiap-RR-03-2010},
institution = {Idiap},
address = {Rue Marconi 19, Martigny},
keywords = {Report_IX, IM2.IP1, Group Bourlard, techreport},
abstract = {The goal of the AMIDA mini-project is to transfer some of the technologies developed within the AMIDA project to be used by a Klewel retrieval system. More specifically, the main focus is to develop a speech-to-text application based on the AMIDA Automatic Speech Recognition (ASR) system which could be potentially implemented by Klewel in their conference webcasting system. First, this document describes experimental setup and results achieved in the project devoted to the automatic processing of real lecture recordings provided by Klewel. Then, a demonstrator --- an application created for demonstrating Automatic Speech Recognition (ASR) results --- is described.},
projects = {Idiap,
AMIDA},
}

@inproceedings{Ganapathy_WASPAA2009-2_2009,
title = {Applications of Signal Analysis Using Autoregressive Models for Amplitude Modulation},
author = {Ganapathy, S.  and Thomas, S.  and Motlicek, P.  and Hermansky, H. },
booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics, 2009, WASPAA '09.},
year = {2009},
month = {October},
pages = {341--344},
organization = {IEEE},
location = {Mohonk Mountain House, New Paltz, New York, USA},
note = {Digital Object Identifier 10.1109/ASPAA.2009.534649},
url = {http://www.waspaa2009.com},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {Frequency Domain Linear Prediction (FDLP) represents an efficient technique for representing the long-term amplitude modulations (AM) of speech/audio signals using autoregressive models. For the proposed analysis technique, relatively long temporal segments (1000 ms) of the input signal are decomposed into a set of sub-bands. FDLP is applied on each sub-band to model the temporal envelopes. The residual of the linear prediction represents the frequency modulations (FM) in the sub-band signal. In this paper, we present several applications of the proposed AM-FM decomposition technique for a variety of tasks like wide-band audio coding, speech recognition in reverberant environments and robust feature extraction for phoneme recognition.},
projects = {Idiap,
AMIDA,
DIRAC,
IM2},
}

@techreport{Vijayasenan_Idiap-RR-23-2010,
title = {Advances in Fast Multistream Diarization based on the Information Bottleneck Framework},
author = {Vijayasenan, D.  and Valente, F.  and Bourlard, H. },
year = {2010},
month = {July},
type = {Idiap-RR},
number = {Idiap-RR-23-2010},
institution = {Idiap},
keywords = {Report_IX, IM2.IP1, Group Bourlard, techreport},
abstract = {Multistream diarization is an effective way to improve the diarization performance, MFCC and Time Delay Of Arrivals (TDOA) being the most commonly used features. This paper extends our previous work on information bottleneck diarization aiming to include large number of features besides MFCC and TDOA while keeping computational costs low. At first HMM/GMM and IB systems are compared in case of two and four feature streams and analysis of errors is performed. Results on a dataset of 17 meetings show that, in spite of comparable oracle performances, the IB system is more robust to feature weight variations. Then a sequential optimization is introduced that further improves the speaker error by 5--8% relative. In the last part, computational issues are discussed. The proposed approach is significantly faster and its complexity marginally grows with the number of feature streams running in 0.75 realtime even with four streams achieving a speaker error equal to 6%.},
projects = {Idiap,
AMIDA,
IM2},
}

@inproceedings{Imseng_ICASSP_2010,
title = {An Adaptive Initialization Method for Speaker Diarization based on Prosodic Features},
author = {Imseng, D.  and Friedland, G. },
crossref = {Imseng_Idiap-RR-02-2010},
booktitle = {Proceedings IEEE International Conference on Acoustics, Speech and Signal Processing},
year = {2010},
month = {March},
pages = {4946--4949},
location = {Dallas, USA},
keywords = {Gaussian Mixture Models, Prosodic features, Speaker Diarization, Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {The following article presents a novel, adaptive initialization scheme that can be applied to most state-of-the-art Speaker Diarization algorithms, i.e. algorithms that use agglomerative hierarchical clustering with Bayesian Information Criterion (BIC) and Gaussian Mixture Models (GMMs) of frame-based cepstral features (MFCCs). The initialization method is a combination of the recently proposed ``adaptive seconds per Gaussian'' (ASPG) method and a new pre-clustering and number of initial clusters estimation method based on prosodic features. The presented initialization method has two important advantages. First, the method requires no manual tuning and is robust against file length and speaker count variations. Second, the method outperforms our previously used initialization methods on all benchmark files that were presented in the 2006, 2007, and 2009 NIST Rich Transcription (RT) evaluations and results in a Diarization Error Rate (DER) improvement of up to 67% (relative).},
projects = {Idiap,
IM2,
AMIDA},
}

@inproceedings{Subburaman_ICASSP_2010,
title = {An Alternative Scanning Strategy to Detect Faces},
author = {Subburaman, Venkatesh Bala and Marcel, S.},
booktitle = {Proceedings IEEE International Conference on Acoustics, Speech and Signal Processing},
year = {2010},
location = {Dallas, USA},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {The sliding window approach is the most widely used technique to detect faces in an image. Usually a classifier is applied on a regular grid and to speed up the scanning, the grid spacing is increased, which increases the number of miss detections. In this paper we propose an alternative scanning method which minimizes the number of misses, while improving the speed of detection. To achieve this we use an additional classifier that predicts the bounding box of a face within a local search area. Then a face/non-face classifier is used to verify the presence or absence of a face. We propose a new combination of binary features which we term as u-Ferns for bounding box estimation, which performs comparable or better than former techniques. Experimental evaluation on benchmark database show that we can achieve 15-30% improvement in detection rate or speed when compared to the standard scanning technique.},
projects = {Idiap, IM2, MOBIO},
}

@inproceedings{Liang_INTERSPEECH_2010,
title = {An Analysis of Language Mismatch in HMM State Mapping-Based Cross-Lingual Speaker Adaptation},
author = {Liang, H.  and Dines, J. },
crossref = {Liang_Idiap-RR-16-2010},
booktitle = {Proceedings of Interspeech},
year = {2010},
month = {September},
location = {Makuhari, Japan},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {This paper provides an in-depth analysis of the impacts of language mismatch on the performance of cross-lingual speaker adaptation. Our work confirms the influence of language mismatch between average voice distributions for synthesis and for transform estimation and the necessity of eliminating this mismatch in order to effectively utilize multiple transforms for cross-lingual speaker adaptation. Specifically, we show that language mismatch introduces unwanted language-specific information when estimating multiple transforms, thus making these transforms detrimental to adaptation performance. Our analysis demonstrates speaker characteristics should be separated from language characteristics in order to improve cross-lingual adaptation performance.},
projects = {Idiap,
EMIME},
}

@techreport{Vijayasenan_Idiap-RR-22-2010,
title = {An Information Theoretic Combination of MFCC and TDOA Features for Speaker Diarization},
author = {Vijayasenan, D. and Valente, F. and Bourlard, H.},
year = {2010},
month = {July},
type = {Idiap-RR},
number = {Idiap-RR-22-2010},
institution = {Idiap},
keywords = {Report_IX, IM2.IP1, Group Bourlard, techreport},
abstract = {This work describes a novel system for speaker diarization of meetings recordings based on the combination of acoustic features (MFCC) and Time Delay of Arrivals (TDOA). The first part of the paper analyzes differences between MFCC and TDOA features which possess completely different statistical properties. When Gaussian Mixture Models are used, experiments reveal that the diarization system is sensitive to the different recording scenarios (i.e. meeting rooms with varying number of microphones). In the second part, a new multistream diarization system is proposed extending previous work on Information Theoretic diarization. Both speaker clustering and speaker realignment steps are discussed; in contrary to current systems, the proposed method avoids to perform the feature combination averaging log-likelihood scores. Experiments on meetings data reveal that the proposed approach outperforms the GMM based system when the recording is done with varying number of microphones.},
projects = {Idiap, SNSF-MULTI, IM2},
}

@techreport{Pronobis_Idiap-RR-30-2009,
title = {Analysis of F0 and Cepstral Features for Robust Automatic Gender Recognition},
author = {Pronobis, M. and Magimai-Doss, M.},
year = {2009},
month = {November},
type = {Idiap-RR},
number = {Idiap-RR-30-2009},
institution = {Idiap},
keywords = {Report_IX, IM2.IP1, Group Bourlard, techreport},
abstract = {In this paper, we analyze applicability of F0 and cepstral features, namely LPCCs, MFCCs, PLPs for robust Automatic Gender Recognition (AGR). Through gender recognition studies on BANCA corpus comprising datasets of varying complexity, we show that use of voiced speech frames and modelling of higher spectral detail (i.e. using higher order cepstral coefficients) along with the use of dynamic features improve the robustness of the system towards mismatched training and test conditions. Moreover, our study shows that for matched clean training and test conditions and for multi-condition training, the AGR system is less sensitive to the order of cepstral coefficients and the use of dynamic features gives little-to-no gain. F0 and cepstral features perform equally well under clean conditions, however under noisy conditions cepstral features yield robust system compared to F0-based system.},
projects = {Idiap, AMIDA},
}

@article{Pinto_IEEE_TASLP_2010,
title = {Analysis of MLP Based Hierarchical Phoneme Posterior Probability Estimator},
author = {Pinto, J. P.  and Sivaram, G. S. V. S.  and Magimai-Doss, M.  and Hermansky, H.  and Bourlard, H. },
journal = {IEEE Transactions on Audio, Speech, and Language Processing},
year = {2010},
keywords = {Report_IX, IM2.IP1, Group Bourlard, article},
abstract = {We analyze a simple hierarchical architecture consisting of two multilayer perceptron (MLP) classifiers in tandem to estimate the phonetic class conditional probabilities. In this hierarchical setup, the first MLP classifier is trained using standard acoustic features. The second MLP is trained using the posterior probabilities of phonemes estimated by the first, but with a long temporal context of around 150-230 ms. Through extensive phoneme recognition experiments, and the analysis of the trained second MLP using Volterra series, we show that (a) the hierarchical system yields higher phoneme recognition accuracies - an absolute improvement of 3.5% and 9.3% on TIMIT and CTS respectively - over the conventional single MLP based system, (b) there exists useful information in the temporal trajectories of the posterior feature space, spanning around 230 ms of context, (c) the second MLP learns the phonetic temporal patterns in the posterior features, which include the phonetic confusions at the output of the first MLP as well as the phonotactics of the language as observed in the training data, and (d) the second MLP classifier requires fewer number of parameters and can be trained using lesser amount of training data.},
projects = {Idiap,
SNSF-KEYSPOT,
IM2},
}

@inproceedings{Asaei_ICASSP_2010,
title = {Analysis of Phone Posterior Feature Space Exploiting Class Specific Sparsity and MLP-based Similarity Measure},
author = {Asaei, Afsaneh and Picart, B. and Bourlard, H.},
crossref = {Asaei_Idiap-Internal-RR-09-2010},
booktitle = {2010 IEEE International Conference on Acoustics, Speech and Signal Processing},
year = {2010},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
projects = {Idiap},
}

@inproceedings{Motlicek_ICASSP2010_2010,
title = {Application of Out-Of-Language Detection To Spoken-Term Detection},
author = {Motlicek, P. and Valente, F.},
crossref = {Motlicek_Idiap-RR-04-2010},
booktitle = {2010 IEEE International Conference on Acoustics, Speech and Signal Processing},
year = {2010},
month = {April},
location = {Dallas, USA},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {This paper investigates the detection of English spoken terms in a conversational multi-language scenario. The speech is processed using a large vocabulary continuous speech recognition system. The recognition output is represented in the form of word recognition lattices which are then used to search required terms. Due to the potential multi-lingual speech segments at the input, the spoken term detection system is combined with a module performing out-of language detection to adjust its confidence scores. First, experimental results of spoken term detection are provided on the conversational telephone speech database distributed by NIST in 2006. Then, the system is evaluated on a multi-lingual database with and without employment of the out-of-language detection module, where we are only interested in detecting English terms (stored in the index database). Several strategies to combine these two systems in an efficient way are proposed and evaluated. Around 7% relative improvement over a stand-alone STD is achieved},
projects = {Idiap, AMIDA, TA2, IM2},
}

@inproceedings{Chittaranjan_ICASSP2010_2010,
title = {Are you a Werewolf? Detecting deceptive roles and outcomes in a conversational role-playing game},
author = {Chittaranjan, Gokul and Hung, H. },
booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing},
year = {2010},
month = {March},
keywords = {deception, Nonverbal behavior, role analysis, Report_IX, IM2.IP3, Group Bourlard, inproceedings},
abstract = {This paper addresses the task of automatically detecting outcomes of social interaction patterns, using non-verbal audio cues in competitive role-playing games (RPGs). For our experiments, we introduce a new data set which features 3 hours of audio-visual recordings of the popular ``Are you a Werewolf?'' RPG. Two problems are approached in this paper: Detecting lying or suspicious behavior using non-verbal audio cues in a social context and predicting participants' decisions in a game-day by analyzing speaker turns. Our best classifier exhibits a performance improvement of 87% over the baseline for detecting deceptive roles. Also, we show that speaker turn based features can be used to determine the outcomes in the initial stages of the game, when the group is large.},
}

@inproceedings{Garau_INTERSPEECH_2010,
title = {Audio-Visual Synchronisation for Speaker Diarisation},
author = {Garau, G.  and Dielmann, A.  and Bourlard, H. },
crossref = {Garau_Idiap-Internal-RR-129-2010},
booktitle = {International Conference on Speech and Language Processing, Interspeech},
year = {2010},
month = {September},
location = {Makuhari, Japan},
keywords = {audio-visual speech synchrony, canonical correlation analysis, multimodal speaker diarisation, multiparty meetings, mutual information, Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {The role of audio-visual speech synchrony for speaker diarisation is investigated on the multiparty meeting domain. We measured both mutual information and canonical correlation on different sets of audio and video features. As acoustic features we considered energy and MFCCs. As visual features we experimented both with motion intensity features, computed on the whole image, and Kanade Lucas Tomasi motion estimation. Thanks to KLT we decomposed the motion in its horizontal and vertical components. The vertical component was found to be more reliable for speech synchrony estimation. The mutual information between acoustic energy and KLT vertical motion of skin pixels, not only resulted in a 20% relative improvement over a MFCC only diarisation system, but also outperformed visual features such as motion intensities and head poses.},
projects = {Idiap,
AMIDA,
IM2},
}

@techreport{Korchagin_Idiap-RR-39-2009,
title = {Automatic Temporal Alignment of AV Data},
author = {Korchagin, D.  and Garner, P. N.  and Dines, J. },
year = {2009},
month = {December},
type = {Idiap-RR},
number = {Idiap-RR-39-2009},
institution = {Idiap},
keywords = {audio processing, temporal alignment, time-frequency analysis, Report_IX, IM2.IP1, Group Bourlard, techreport},
abstract = {In this paper, we describe the automatic audio-based temporal alignment of audio-visual data, recorded by different cameras, camcorders or mobile phones during social events like high school concerts. All recorded data is temporally aligned with a common master track, recorded by a reference camera. The core of the algorithm is based on perceptual time-frequency analysis with a precision of 10 ms. The results show correct alignment in 99% of cases for a real life dataset.},
projects = {Idiap,
TA2},
}

@inproceedings{Korchagin_ICASSP_2010,
title = {Automatic Temporal Alignment of AV Data with Confidence Estimation},
author = {Korchagin, D.  and Garner, P. N.  and Dines, J. },
crossref = {Korchagin_Idiap-RR-40-2009},
booktitle = {Proceedings IEEE International Conference on Acoustics, Speech and Signal Processing},
year = {2010},
month = {March},
location = {Dallas, USA},
keywords = {pattern matching, reliability estimation, time synchronization, time-frequency analysis, Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {In this paper, we propose a new approach for the automatic audio-based temporal alignment with confidence estimation of audio-visual data, recorded by different cameras, camcorders or mobile phones during social events. All recorded data is temporally aligned based on ASR-related features with a common master track, recorded by a reference camera, and the corresponding confidence of alignment is estimated. The core of the algorithm is based on perceptual time-frequency analysis with a precision of 10 ms. The results show correct alignment in 99% of cases for a real life dataset and surpass the performance of cross correlation while keeping lower system requirements.},
projects = {Idiap,
TA2},
}

@article{Ganapathy_IEEETASLP_2010,
title = {Autoregressive Models of Amplitude Modulations in Audio Compression},
author = {Ganapathy, S.  and Motlicek, P.  and Hermansky, H. },
crossref = {Ganapathy_Idiap-RR-33-2009},
journal = {IEEE Transactions on Audio, Speech, and Language Processing},
year = {2010},
url = {http://www.signalprocessingsociety.org/publications/periodicals/taslp/},
keywords = {Report_IX, IM2.IP1, Group Bourlard, article},
abstract = {We present a scalable medium bit-rate wide-band audio coding technique based on frequency domain linear prediction (FDLP). FDLP is an efficient method for representing the long-term amplitude modulations of speech/audio signals using autoregressive models. For the proposed audio codec, relatively long temporal segments (1000 ms) of the input audio signal are decomposed into a set of critically sampled sub-bands using a quadrature mirror filter (QMF) bank. The technique of FDLP is applied on each sub-band to model the sub-band temporal envelopes. The residual of the linear prediction, which represents the frequency modulations in the sub-band signal [1], are encoded and transmitted along with the envelope parameters. These steps are reversed at the decoder to reconstruct the signal. The proposed codec utilizes a simple signal independent non-adaptive compression mechanism for a wide class of speech and audio signals. The subjective and objective quality evaluations show that the reconstruction signal quality for the proposed FDLP codec compares well with the state-of-the-art audio codecs in the 32-64 kbps range.},
projects = {Idiap},
}

@inproceedings{Roy_ICASSP2010_2010,
title = {Boosted Binary Features for Noise-Robust Speaker Verification},
author = {Roy, A.  and Magimai-Doss, M.  and Marcel, S. },
crossref = {Marcel_Idiap-Internal-RR-101-2009},
booktitle = {2010 IEEE International Conference on Acoustics, Speech and Signal Processing},
year = {2010},
month = {March},
location = {Dallas, Texas},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {The standard approach to speaker verification is to extract cepstral features from the speech spectrum and model them by generative or discriminative techniques. We propose a novel approach where a set of client-specific binary features carrying maximal discriminative information specific to the individual client are estimated from an ensemble of pair-wise comparisons of frequency components in magnitude spectra, using Adaboost algorithm. The final classifier is a simple linear combination of these selected features. Experiments on the XM2VTS database strictly according to a standard evaluation protocol have shown that although the proposed framework yields comparatively lower performance on clean speech, it significantly outperforms the state-of-the-art MFCC-GMM system in mismatched conditions with training on clean speech and testing on speech corrupted by four types of additive noise from the standard Noisex-92 database.},
projects = {Idiap,
IM2,
MOBIO,
SNSF-MULTI},
}

@phdthesis{Heusch_THESIS_2009,
title = {Bayesian Networks as Generative Models for Face Recognition},
author = {Heusch, G.},
year = {2009},
school = {EPFL},
keywords = {Report_IX, IM2.IP1, Group Bourlard, phdthesis},
projects = {Idiap, GMFace, MOBIO},
}

@inproceedings{Heusch_ICB_2009,
title = {Bayesian Networks to Combine Intensity and Color Information in Face Recognition},
author = {Heusch, G.  and Marcel, S. },
crossref = {Heusch_Idiap-RR-27-2009},
booktitle = {International Conference on Biometrics},
series = {Lecture Notes in Computer Science},
year = {2009},
volume = {5558},
pages = {414--423},
publisher = {Springer},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
projects = {Idiap,
GMFace},
}

@article{Kumatani_ASLP_2009a,
title = {Beamforming with a Maximum Negentropy Criterion},
author = {Kumatani, K.  and McDonough, J.  and Rauch, Barbara and Klakow, D.  and Garner, P. N.  and Li, Weifeng},
crossref = {kumatani:rr08-29},
journal = {IEEE Transactions on Audio Speech and Language Processing},
year = {2009},
month = {July},
volume = {17},
number = {5},
pages = {994--1008},
keywords = {Report_IX, IM2.IP1, Group Bourlard, article},
abstract = {In this paper, we address a beamforming application based on the capture of far-field speech data from a single speaker in a real meeting room. After the position of the speaker is estimated by a speaker tracking system, we construct a subband-domain beamformer in generalized sidelobe canceller (GSC) configuration. In contrast to conventional practice, we then optimize the active weight vectors of the GSC so as to obtain an output signal with maximum negentropy (MN). This implies the beamformer output should be as non-Gaussian as possible. For calculating negentropy, we consider the {$\Gamma$} and the generalized Gaussian (GG) pdfs. After MN beamforming, Zelinski post-filtering is performed to further enhance the speech by removing residual noise. Our beamforming algorithm can suppress noise and reverberation without the signal cancellation problems encountered in the conventional beamforming algorithms. We demonstrate this fact through a set of acoustic simulations. Moreover, we show the effectiveness of our proposed technique through a series of far-field automatic speech recognition experiments on the Multi-Channel Wall Street Journal Audio Visual Corpus (MC-WSJ-AV), a corpus of data captured with real far-field sensors, in a realistic acoustic environment, and spoken by real speakers. On the MC-WSJ-AV evaluation data, the delay-and-sum beamformer with post-filtering achieved a word error rate (WER) of 16.5%. MN beamforming with the {$\Gamma$} pdf achieved a 15.8% WER, which was further reduced to 13.2% with the GG pdf, whereas the simple delay-and-sum beamformer provided a WER of 17.8%. To the best of our knowledge, no lower error rates at present have been reported in the literature on this ASR task.},
projects = {AMIDA},
}

@inproceedings{Vinciarelli_ACII_2009,
title = {Canal9: A database of political debates for analysis of social interactions},
author = {Vinciarelli, A. and Dielmann, A. and Favre, S. and Salamin, H.},
booktitle = {Proceedings of the International Conference on Affective Computing and Intelligent Interaction (IEEE International Workshop on Social Signal Processing)},
year = {2009},
month = {September},
pages = {1--4},
location = {Amsterdam, Netherlands},
isbn = {978-1-4244-4800-5},
note = {Publication Date: 10-12 Sept. 2009},
doi = {10.1109/acii.2009.5349466},
keywords = {Report_IX, IM2.IP3, Group Bourlard, inproceedings},
abstract = {Automatic analysis of social interactions attracts major attention in the computing community, but relatively few benchmarks are available to researchers active in the domain. This paper presents a new, publicly available, corpus of political debates including not only raw data, but a rich set of socially relevant annotations such as turn-taking (who speaks when and how much), agreement and disagreement between participants, and role played by people involved in each debate. The collection includes 70 debates for a total of 43 hours and 10 minutes of material.},
projects = {Idiap, SSPNet},
}

@inproceedings{Roy_ICPR2010_2010,
title = {Crossmodal Matching of Speakers using Lip and Voice Features in Temporally Non-overlapping Audio and Video Streams},
author = {Roy, A. and Marcel, S.},
crossref = {Roy_Idiap-RR-13-2010},
booktitle = {20th International Conference on Pattern Recognition, Istanbul, Turkey},
year = {2010},
month = {April},
organization = {International Association for Pattern Recognition (IAPR)},
location = {Istanbul, Turkey},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {Person identification using audio (speech) and visual (facial appearance, static or dynamic) modalities, either independently or jointly, is a thoroughly investigated problem in pattern recognition. In this work, we explore a novel task : person identification in a cross-modal scenario, i.e., matching the speaker in an audio recording to the same speaker in a video recording, where the two recordings have been made during different sessions, using speaker specific information which is common to both the audio and video modalities. Several recent psychological studies have shown how humans can indeed perform this task with an accuracy significantly higher than chance. Here we propose two systems which can solve this task comparably well, using purely pattern recognition techniques. We hypothesize that such systems could be put to practical use in multimodal biometric and surveillance systems.},
projects = {Idiap, MOBIO, SNSF-MULTI},
}

@inproceedings{Motlicek_INTERSPEECH2010_2010,
title = {English Spoken Term Detection in Multilingual Recordings},
author = {Motlicek, P. and Valente, F. and Garner, P. N.},
crossref = {Motlicek_Idiap-RR-21-2010},
booktitle = {Proceedings of Interspeech, Makuhari, Japan, 2010},
year = {2010},
month = {September},
organization = {ISCA},
location = {Makuhari, Japan},
keywords = {Confidence Measure (CM), LVCSR, Out-Of-Language (OOL) detection, Spoken Term Detection (STD), Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {This paper investigates the automatic detection of English spoken terms in a multi-language scenario over real lecture recordings. Spoken Term Detection (STD) is based on an LVCSR where the output is represented in the form of word lattices. The lattices are then used to search the required terms. Processed lectures are mainly composed of English, French and Italian recordings where the language can also change within one recording. Therefore, the English STD system uses an Out-Of-Language (OOL) detection module to filter out non-English input segments. OOL detection is evaluated w.r.t. various confidence measures estimated from word lattices. Experimental studies of OOL detection followed by English STD are performed on several hours of multilingual recordings. Significant improvement of OOL STD over a stand-alone STD system is achieved (relatively more than 50% in EER). Finally, an additional modality (text slides in the form of PowerPoint presentations) is exploited to improve STD.},
projects = {Idiap, IM2, TA2},
}

@techreport{Hung_Idiap-RR-12-2010,
title = {Estimating Cohesion in Small Groups using Audio-Visual Nonverbal Behavior},
author = {Hung, H. and Gatica-Perez, D.},
year = {2010},
month = jun,
type = {Idiap-RR},
number = {Idiap-RR-12-2010},
institution = {Idiap},
keywords = {Report_IX, IM2.IP1, Group Bourlard, techreport},
abstract = {Cohesiveness in teams is an essential part of ensuring the smooth running of task-oriented groups. Research in social psychology and management has shown that good cohesion in groups can be correlated with team effectiveness or productivity so automatically estimating group cohesion for team training can be a useful tool. This paper addresses the problem of analyzing group behavior within the context of cohesion. 4 hours of audio-visual group meeting data was used for collecting annotations on the cohesiveness of 4-participant teams. We propose a series of audio and video features, which are inspired by findings in the social sciences literature. Our study is validated on a set of 61 2-minute meeting segments which showed high agreement amongst human annotators who were asked to identify meetings which have high or low cohesion.},
projects = {Idiap, IM2, AMIDA},
}

@article{Hung_IEEETRANS.ASL_2010,
title = {Estimating Dominance in Multi-Party Meetings Using Speaker Diarization},
author = {Hung, H. and Huang, Y. and Friedland, G. and Gatica-Perez, D.},
journal = {IEEE Transactions on Audio, Speech, and Language Processing},
year = {2010},
keywords = {Report_IX, IM2.IP1, Group Bourlard, article},
abstract = {With the increase in cheap commercially available sensors, recording meetings is becoming an increasingly practical option. With this trend comes the need to summarize the recorded data in semantically meaningful ways. Here, we investigate the task of automatically measuring dominance in small group meetings when only a single audio source is available. Past research has found that speaking length as a single feature, provides a very good estimate of dominance. For these tasks we use speaker segmentations generated by our automated faster than real-time speaker diarization algorithm, where the number of speakers is not known beforehand. From user-annotated data, we analyze how the inherent variability of the annotations affects the performance of our dominance estimation method. We primarily focus on examining of how the performance of the speaker diarization and our dominance tasks vary under different experimental conditions and computationally efficient strategies, and how this would impact on a practical implementation of such a system. Despite the use of a state-of-the-art speaker diarization algorithm, speaker segments can be noisy. On conducting experiments on almost 5 hours of audio-visual meeting data, our results show that the dominance estimation is robust to increasing diarization noise.},
projects = {Idiap, AMIDA},
}

@inproceedings{Parthasarathi_PROCEEDINGSOFICASSP2010_2010,
title = {Evaluating the Robustness of Privacy-Sensitive Audio Features for Speech Detection in Personal Audio Log Scenarios},
author = {Parthasarathi, S. H. K. and Magimai-Doss, M. and Bourlard, H. and Gatica-Perez, D.},
booktitle = {ICASSP 2010},
year = {2010},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {Personal audio logs are often recorded in multiple environments. This poses challenges for robust front-end processing, including speech/nonspeech detection (SND). Motivated by this, we investigate the robustness of four different privacy-sensitive features for SND, namely energy, zero crossing rate, spectral flatness, and kurtosis. We study early and late fusion of these features in conjunction with modeling temporal context. These combinations are evaluated in mismatched conditions on a dataset of nearly 450 hours. While both combinations yield improvements over individual features, generally feature combinations perform better. Comparisons with a state-of-the-art spectral based and a privacy-sensitive feature set are also provided.},
projects = {Idiap, SNSF-MULTI},
}

@inproceedings{Berclaz_PETS_2009,
title = {Evaluation of Probabilistic Occupancy Map People Detection for Surveillance Systems},
author = {Berclaz, J. and Shahrokni, A. and Fleuret, F. and Ferryman, James and Fua, P.},
booktitle = {Proceedings of the IEEE International Workshop on Performance Evaluation of Tracking and Surveillance},
year = {2009},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {In this paper, we evaluate the Probabilistic Occupancy Map (POM) pedestrian detection algorithm on the PETS 2009 benchmark dataset. POM is a multi-camera generative detection method, which estimates ground plane occupancy from multiple background subtraction views. Occupancy probabilities are iteratively estimated by fitting a synthetic model of the background subtraction to the binary foreground motion. Furthermore, we test the integration of this algorithm into a larger framework designed for understanding human activities in real environments. We demonstrate accurate detection and localization on the PETS dataset, despite suboptimal calibration and foreground motion segmentation input.},
projects = {Idiap},
}

@techreport{Popescu-Belis_Idiap-Com-01-2010,
title = {Finding without searching},
author = {Popescu-Belis, A.},
year = {2010},
month = jan,
type = {Idiap-Com},
number = {Idiap-Com-01-2010},
institution = {Idiap},
keywords = {Report_IX, IM2.IP1, Group Bourlard, techreport},
projects = {Idiap, AMIDA, IM2},
}

@techreport{Negoescu_Idiap-RR-18-2010,
title = {Flickr Groups: Multimedia Communities for Multimedia Analysis},
author = {Negoescu, R.-A. and Gatica-Perez, D.},
editor = {Hua, Xian-Sheng and Worring, Marcel and Chua, Tat-Seng},
crossref = {Negoescu_Idiap-Internal-RR-12-2010},
year = {2010},
month = jul,
type = {Idiap-RR},
number = {Idiap-RR-18-2010},
institution = {Idiap},
keywords = {Report_IX, IM2.IP1, Group Bourlard, techreport},
projects = {SNSF-MULTI},
}

@inproceedings{Negoescu_ACMMM09_2009,
title = {Flickr Hypergroups},
author = {Negoescu, R.-A. and Adams, B. and Phung, D. and Venkatesh, S. and Gatica-Perez, D.},
crossref = {Idiap-Internal-RR-73-2009},
booktitle = {Proceedings of the 17th ACM International Conference on Multimedia},
year = {2009},
month = oct,
keywords = {Flickr groups LDA, Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {The amount of multimedia content available online constantly increases, and this leads to problems for users who search for content or similar communities. Users in Flickr often self-organize in user communities through Flickr Groups. These groups are particularly interesting as they are a natural instantiation of the content--relations social media paradigm. We propose a novel approach to group searching through hypergroup discovery. Starting from roughly 11,000 Flickr groups' content and membership information, we create three different bag-of-word representations for groups, on which we learn probabilistic topic models. Finally, we cast the hypergroup discovery as a clustering problem that is solved via probabilistic affinity propagation. We show that hypergroups so found are generally consistent and can be described through topic-based and similarity-based measures. Our proposed solution could be relatively easy implemented as an application to enrich Flickr's traditional group search.},
projects = {Idiap, SNSF-MULTI},
}

@inproceedings{Dielmann_INTERSPEECH,
title = {Floor Holder Detection and End of Speaker Turn Prediction in Meetings},
author = {Dielmann, Alfred and Garau, Giulia and Bourlard, Herv{\'e}},
crossref = {Idiap-Internal-RR-127-2010},
booktitle = {International Conference on Speech and Language Processing, Interspeech},
year = {2010},
month = sep,
publisher = {ISCA},
location = {Makuhari, Japan},
keywords = {Dynamic Bayesian Network, floor control, Multiparty Conversation, non-verbal features, speaker turn, Report_IX, IM2.IP3, Group Bourlard, inproceedings},
abstract = {We propose a novel fully automatic framework to detect which meeting participant is currently holding the conversational floor and when the current speaker turn is going to finish. Two sets of experiments were conducted on a large collection of multiparty conversations: the AMI meeting corpus. Unsupervised speaker turn detection was performed by post-processing the speaker diarization and the speech activity detection outputs. A supervised end-of-speaker-turn prediction framework, based on Dynamic Bayesian Networks and automatically extracted multimodal features (related to prosody, overlapping speech, and visual motion), was also investigated. These novel approaches resulted in good floor holder detection rates (13.2% Floor Error Rate), attaining state of the art end-of-speaker-turn prediction performances.},
projects = {Idiap, IM2},
}

@inproceedings{Aran_ICPR2010_2010,
title = {Fusing Audio-Visual Nonverbal Cues to Detect Dominant People in Conversations},
author = {Aran, Oya and Gatica-Perez, D.},
crossref = {Aran_Idiap-RR-17-2010},
booktitle = {20th International Conference on Pattern Recognition, Istanbul, Turkey, 2010},
year = {2010},
month = aug,
location = {Istanbul, Turkey},
keywords = {Report_IX, IM2.IP3, Group Bourlard, inproceedings},
abstract = {This paper addresses the multimodal nature of social dominance and presents multimodal fusion techniques to combine audio and visual nonverbal cues for dominance estimation in small group conversations. We combine the two modalities both at the feature extraction level and at the classifier level via score and rank level fusion. The classification is done by a simple rule-based estimator. We perform experiments on a new 10-hour dataset derived from the popular AMI meeting corpus. We objectively evaluate the performance of each modality and each cue alone and in combination. Our results show that the combination of audio and visual cues is necessary to achieve the best performance.},
projects = {Idiap, NOVICOM},
}

@inproceedings{Roy_BMVC2009_2009,
title = {Haar Local Binary Pattern Feature for Fast Illumination Invariant Face Detection},
author = {Roy, A. and Marcel, S.},
crossref = {Roy_Idiap-RR-28-2009},
booktitle = {British Machine Vision Conference 2009},
year = {2009},
month = sep,
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {Face detection is the first step in many visual processing systems like face recognition, emotion recognition and lip reading. In this paper, we propose a novel feature called Haar Local Binary Pattern (HLBP) feature for fast and reliable face detection, particularly in adverse imaging conditions. This binary feature compares bin values of Local Binary Pattern histograms calculated over two adjacent image subregions. These subregions are similar to those in the Haar masks, hence the name of the feature. They capture the region-specific variations of local texture patterns and are boosted using AdaBoost in a framework similar to that proposed by Viola and Jones. Preliminary results obtained on several standard databases show that it competes well with other face detection systems, especially in adverse illumination conditions.},
projects = {Idiap, MOBIO, SNSF-MULTI},
}

@inproceedings{Korchagin_INTERSPEECH_2010,
title = {Hands Free Audio Analysis from Home Entertainment},
author = {Korchagin, D. and Garner, P. N. and Motlicek, P.},
crossref = {Korchagin_Idiap-RR-27-2010},
booktitle = {Proceedings of Interspeech},
year = {2010},
month = sep,
location = {Makuhari, Japan},
keywords = {direction of arrival, real-time audio processing, speech meta-data, Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {In this paper, we describe a system developed for hands free audio analysis for a living room environment. It comprises detection and localisation of the verbal and paralinguistic events, which can augment the behaviour of virtual director and improve the overall experience of interactions between spatially separated families and friends. The results show good performance in reverberant environments and fulfil real-time requirements.},
projects = {Idiap, TA2},
}

@inproceedings{Imseng_INTERSPEECH_2010,
title = {Hierarchical Multilayer Perceptron based Language Identification},
author = {Imseng, D. and Magimai-Doss, M. and Bourlard, H.},
crossref = {Imseng_Idiap-RR-14-2010},
booktitle = {Proceedings of Interspeech},
year = {2010},
month = sep,
location = {Makuhari, Japan},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {Automatic language identification (LID) systems generally exploit acoustic knowledge, possibly enriched by explicit language specific phonotactic or lexical constraints. This paper investigates a new LID approach based on hierarchical multilayer perceptron (MLP) classifiers, where the first layer is a ``universal phoneme set MLP classifier''. The resulting (multilingual) phoneme posterior sequence is fed into a second MLP taking a larger temporal context into account. The second MLP can learn/exploit implicitly different types of patterns/information such as confusion between phonemes and/or phonotactics for LID. We investigate the viability of the proposed approach by comparing it against two standard approaches which use phonotactic and lexical constraints with the universal phoneme set MLP classifier as emission probability estimator. On SpeechDat(II) datasets of five European languages, the proposed approach yields significantly better performance compared to the two standard approaches.},
projects = {Idiap, SNSF-MULTI, IM2},
}

@inproceedings{Saheer_ISCASPEECHSYNTHESISWORKSHOP(SSW7)_2010,
title = {Implementation of VTLN for Statistical Speech Synthesis},
author = {Saheer, L. and Dines, J. and Garner, P. N. and Liang, H.},
crossref = {Saheer_Idiap-RR-32-2010},
booktitle = {Proceedings of ISCA Speech Synthesis Workshop},
year = {2010},
month = sep,
location = {Kyoto, Japan},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {Vocal tract length normalization is an important feature normalization technique that can be used to perform speaker adaptation when very little adaptation data is available. It was shown earlier that VTLN can be applied to statistical speech synthesis and was shown to give additive improvements to CMLLR. This paper presents an EM optimization for estimating more accurate warping factors. The EM formulation helps to embed the feature normalization in the HMM training. This helps in estimating the warping factors more efficiently and enables the use of multiple (appropriate) warping factors for different state clusters of the same speaker.},
projects = {Idiap, EMIME},
}

@article{Bogdan_MMTOOLS&APPLICATIONS_2010,
title = {Inferring competitive role patterns in reality TV show through nonverbal analysis},
author = {Bogdan, R. and Gatica-Perez, D.},
journal = {Multimedia Tools and Applications, Special issue on Social Media},
year = {2010},
keywords = {Report_IX, IM2.IP3, Group Bourlard, article},
abstract = {This paper introduces a new facet of social media, namely that depicting social interaction. More concretely, we address this problem from the perspective of nonverbal behavior-based analysis of competitive meetings. For our study, we made use of ``The Apprentice'' reality TV show, which features a competition for a real, highly paid corporate job. Our analysis is centered around two tasks regarding a person's role in a meeting: predicting the person with the highest status, and predicting the fired candidates. We address this problem by adopting both supervised and unsupervised strategies. The current study was carried out using nonverbal audio cues. Our approach is based only on the nonverbal interaction dynamics during the meeting without relying on the spoken words. The analysis is based on two types of data: individual and relational measures. Results obtained from the analysis of a full season of the show are promising (up to 85.7% of accuracy in the first case and up to 92.8% in the second case). Our approach has been conveniently compared with the Influence Model, demonstrating its superiority.},
projects = {Idiap},
}

@inproceedings{Roy_BTAS2010_2010,
title = {Introducing Crossmodal Biometrics: Person Identification from Distinct Audio {\&} Visual Streams},
author = {Roy, A. and Marcel, S.},
booktitle = {IEEE Fourth International Conference on Biometrics: Theory, Applications and Systems},
year = {2010},
number = {4},
keywords = {audio and video classification, audio-visual speaker recognition, crossmodal matching, Multimodal biometrics, Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {Person identification using audio or visual biometrics is a well-studied problem in pattern recognition. In this scenario, both training and testing are done on the same modalities. However, there can be situations where this condition is not valid, i.e. training and testing has to be done on different modalities. This could arise, for example, in covert surveillance. Is there any person specific information common to both the audio and visual (video-only) modalities which could be exploited to identify a person in such a constrained situation? In this work, we investigate this question in a principled way and propose a framework which can perform this task consistently better than chance, suggesting that such crossmodal biometric information exists.},
projects = {Idiap, IM2, MOBIO, SNSF-MULTI},
}

@techreport{Asaei_Idiap-RR-11-2010,
title = {Investigation of kNN Classifier on Posterior Features Towards Application in Automatic Speech Recognition},
author = {Asaei, Afsaneh and Bourlard, H. and Picart, B.},
year = {2010},
month = jun,
type = {Idiap-RR},
number = {Idiap-RR-11-2010},
institution = {Idiap},
keywords = {Report_IX, IM2.IP1, Group Bourlard, techreport},
abstract = {Class posterior distributions can be used to classify or as intermediate features, which can be further exploited in different classifiers (e.g., Gaussian Mixture Models, GMM) towards improving speech recognition performance. In this paper we examine the possibility to use kNN classifier to perform local phonetic classification of class posterior distribution extracted from acoustic vectors. In that framework, we also propose and evaluate a new kNN metric based on the relative angle between feature vectors to define the nearest neighbors. This idea is inspired by the orthogonality characteristic of the posterior features. To fully exploit this attribute, kNN is used in two main steps: (1) the distance is computed as the cosine function of the relative angle between the test vector and the training vector and (2) the nearest neighbors are defined as the samples within a specific relative angle to the test data and the test samples which do not have enough labels in such a hyper-cone are considered as uncertainties and left undecided. This approach is evaluated on TIMIT database and compared to other metrics already used in literature for measuring the similarity between posterior probabilities. Based on our experiments, the proposed approach yields 78.48% frame level accuracy while specifying 15.17% uncertainties in the feature space.},
}

@techreport{Negoescu_Idiap-RR-20-2010,
title = {Kodak Moments and Flickr Diamonds: How Users Shape Large-scale Media},
author = {Negoescu, R.-A. and Loui, Alexander and Gatica-Perez, D.},
year = {2010},
month = jul,
type = {Idiap-RR},
number = {Idiap-RR-20-2010},
institution = {Idiap},
keywords = {Report_IX, IM2.IP1, Group Bourlard, techreport},
abstract = {In today's age of digital multimedia deluge, a clear understanding of the dynamics of online communities is capital. Users have abandoned their role of passive consumers and are now the driving force behind large-scale media repositories, whose dynamics and shaping factors are not yet fully understood. In this paper we present a novel human-centered analysis of two major photo sharing websites, Flickr and Kodak Gallery. On a combined dataset of over 5 million tagged photos, we investigate fundamental differences and similarities at the level of tag usage and propose a joint probabilistic topic model to provide further insight into semantic differences between the two communities. Our results show that the effects of the users' motivations and needs can be strongly observed in this large-scale data, in the form of what we call Kodak Moments and Flickr Diamonds. They are an indication that system designers should carefully take into account the target audience and its needs.},
}

@inproceedings{Ricci_ICIP_2009,
title = {Learning Large Margin Likelihood for Realtime Head Pose Tracking},
author = {Ricci, E. and Odobez, J.-M.},
booktitle = {IEEE Int. Conference on Image Processing, Cairo, Egypt},
year = {2009},
month = oct,
organization = {IEEE},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
projects = {Idiap, AMIDA, TA2, IM2},
}

@inproceedings{Farrahi_ICMI-MLMI_2009,
title = {Learning and Predicting Multimodal Daily Life Patterns from Cell Phones},
author = {Farrahi, K. and Gatica-Perez, D.},
booktitle = {ICMI-MLMI},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group Bourlard, inproceedings},
abstract = {In this paper, we investigate the multimodal nature of cell phone data in terms of discovering recurrent and rich patterns in people's lives. We present a method that can discover routines from multiple modalities (location and proximity) jointly modeled, and that uses these informative routines to predict unlabeled or missing data. Using a joint representation of location and proximity data over approximately 10 months of 97 individuals' lives, Latent Dirichlet Allocation is applied for the unsupervised learning of topics describing people's most common locations jointly with the most common types of interactions at these locations. We further successfully predict where and with how many other individuals users will be, for people with both highly and lowly varying lifestyles.},
projects = {Idiap, SNSF-MULTI},
}

@inproceedings{Ganapathy_AES2009_2009,
title = {MDCT for Encoding Residual Signals in Frequency Domain Linear Prediction},
author = {Ganapathy, S. and Motlicek, P. and Hermansky, H.},
crossref = {Ganapathy_Idiap-RR-34-2009},
booktitle = {Audio Engineering Society (AES), 127th Convention},
series = {127th Convention},
year = {2009},
month = oct,
number = {Preprint 7921},
publisher = {Audio Engineering Society, 60 East 42nd Street, New York, New York 10165-2520, USA},
organization = {Audio Engineering Society (AES)},
url = {http://www.aes.org/events/127/},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {Frequency domain linear prediction (FDLP) uses autoregressive models to represent Hilbert envelopes of relatively long segments of speech/audio signals. Although the basic FDLP audio codec achieves good quality of the reconstructed signal at high bit-rates, there is a need for scaling to lower bit-rates without degrading the reconstruction quality. Here, we present a method for improving the compression efficiency of the FDLP codec by the application of the modified discrete cosine transform (MDCT) for encoding the FDLP residual signals. In the subjective and objective quality evaluations, the proposed FDLP codec provides competent quality of reconstructed signal compared to the state-of-the-art audio codecs for the 32--64 kbps range.},
projects = {Idiap},
}

@inproceedings{Pinto_ASRU_2009,
title = {An {MLP} Based Hierarchical System for Task Adaptation in Speech Recognition},
author = {Pinto, J. P. and Magimai-Doss, M. and Bourlard, H.},
booktitle = {Proceedings of the IEEE workshop on Automatic Speech Recognition and Understanding},
year = {2009},
month = dec,
pages = {365--370},
location = {Merano, Italy},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
internal-note = {title field was missing in the original entry; recovered from the ASRU 2009 program -- please verify against the proceedings},
abstract = {We investigate a multilayer perceptron (MLP) based hierarchical approach for task adaptation in automatic speech recognition. The system consists of two MLP classifiers in tandem. A well-trained MLP available off-the-shelf is used at the first stage of the hierarchy. A second MLP is trained on the posterior features estimated by the first, but with a long temporal context of around 130 ms. By using an MLP trained on 250 hours of conversational telephone speech, the hierarchical adaptation approach yields a word error rate of 1.8% on the 600-word Phonebook isolated word recognition task. This compares favorably to the error rate of 4% obtained by the conventional single MLP based system trained with the same amount of Phonebook data that is used for adaptation. The proposed adaptation scheme also benefits from the ability of the second MLP to model the temporal information in the posterior features.},
projects = {Idiap, SNSF-KEYSPOT, IM2},
}

@techreport{McCool_Idiap-Com-02-2009,
title = {MOBIO Database for the ICPR 2010 Face and Speech Competition},
author = {McCool, C. and Marcel, S.},
year = {2009},
month = nov,
type = {Idiap-Com},
number = {Idiap-Com-02-2009},
institution = {Idiap},
keywords = {Report_IX, IM2.IP1, Group Bourlard, techreport},
abstract = {This document presents an overview of the mobile biometry (MOBIO) database. This document is written expressly for the face and speech competition organised for the 2010 International Conference on Pattern Recognition.},
projects = {Idiap, MOBIO},
}

@techreport{Marcel_Idiap-RR-31-2010,
title = {MOBIO: Mobile Biometric Face and Speaker Authentication},
author = {Marcel, S. and McCool, C. and Atanasoaei, Cosmin and Tarsetti, Flavio and Pesan, Jan and Matejka, Pavel and Cernocky, Jan and Helistekangas, Mika and Turtinen, Markus},
year = {2010},
month = aug,
type = {Idiap-RR},
number = {Idiap-RR-31-2010},
institution = {Idiap},
keywords = {Report_IX, IM2.IP1, Group Bourlard, techreport},
abstract = {This paper presents a mobile biometric person authentication demonstration system. It consists of verifying a user's claimed identity by biometric means and more particularly using their face and their voice simultaneously on a Nokia N900 mobile device with its built-in sensors (frontal video camera and microphone).},
projects = {MOBIO},
}

@inproceedings{Friedland_ICASSP_2009,
title = {Multi-Modal Speaker Diarization of Real-World Meetings Using Compressed-Domain Video Features},
author = {Friedland, G. and Hung, H. and Yeo, Chuohao},
booktitle = {International Conference on Audio, Speech and Signal Processing},
year = {2009},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {Speaker diarization is originally defined as the task of determining ``who spoke when'' given an audio track and no other prior knowledge of any kind. The following article shows a multi-modal approach where we improve a state-of-the-art speaker diarization system by combining standard acoustic features (MFCCs) with compressed domain video features. The approach is evaluated on over 4.5 hours of the publicly available AMI meetings dataset which contains challenges such as people standing up and walking out of the room. We show a consistent improvement of about 34% relative in speaker error rate (21% DER) compared to a state-of-the-art audio-only baseline.},
projects = {Idiap, IM2},
}

@incollection{Popescu-Belis_ELSEVIER_2009,
title = {Managing Multimodal Data, Metadata and Annotations: Challenges and Solutions},
author = {Popescu-Belis, A.},
editor = {Thiran, J.-Ph. and Marques, F. and Bourlard, H.},
booktitle = {Multimodal Signal Processing for Human-Computer Interaction},
year = {2009},
pages = {183--203},
publisher = {Elsevier / Academic Press},
keywords = {Report_IX, IM2.IP1, Group Bourlard, incollection},
projects = {Idiap, IM2, AMIDA},
}

@inproceedings{Korchagin_UCM_2009,
title = {Memoirs of Togetherness from Audio Logs},
author = {Korchagin, D.},
crossref = {Korchagin_Idiap-RR-36-2009},
booktitle = {Proceedings International ICST Conference on User Centric Media},
year = {2009},
month = dec,
location = {Venice, Italy},
address = {P.O. Box 592, CH-1920 Martigny, Switzerland},
keywords = {confidence estimation, pattern matching, time-frequency analysis, Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {In this paper, we propose a new concept how tempo-social information about moments of togetherness within a social group of people can be retrieved in the palm of the hand from social context. The social context is digitised by audio logging of the same user centric device such as mobile phone. Being asynchronously driven it allows automatically logging social events with involved parties and thus helps to feel at home anywhere anytime and to nurture user to group relationships. The core of the algorithm is based on perceptual time-frequency analysis via confidence estimate of dynamic cepstral pattern matching between audio logs of people within a social group. The results show robust retrieval and surpass the performance of cross correlation while keeping lower system requirements.},
projects = {Idiap, TA2},
}

@techreport{Farrahi_Idiap-RR-28-2010,
title = {Mining Human Location-Routines using a Multi-Level Topic Model},
author = {Farrahi, K. and Gatica-Perez, D.},
year = {2010},
month = aug,
type = {Idiap-RR},
number = {Idiap-RR-28-2010},
institution = {Idiap},
keywords = {Report_IX, IM2.IP3, Group Bourlard, techreport},
projects = {Idiap, SNSF-MULTI},
}

@techreport{Marcel_Idiap-RR-09-2010,
title = {Mobile Biometry (MOBIO) Face and Speaker Verification Evaluation},
author = {Marcel, S. and McCool, C. and Matejka, Pavel and Ahonen, Timo and Cernocky, Jan},
year = {2010},
month = may,
type = {Idiap-RR},
number = {Idiap-RR-09-2010},
institution = {Idiap},
keywords = {Report_IX, IM2.IP1, Group Bourlard, techreport},
abstract = {This paper evaluates the performance of face and speaker verification techniques in the context of a mobile environment. The mobile environment was chosen as it provides a realistic and challenging test-bed for biometric person verification techniques to operate. For instance the audio environment is quite noisy and there is limited control over the illumination conditions and the pose of the subject for the video. To conduct this evaluation, a part of a database captured during the ``Mobile Biometry'' (MOBIO) European Project was used. In total there were nine participants to the evaluation who submitted a face verification system and five participants who submitted speaker verification systems. The nine face verification systems all varied significantly in terms of both verification algorithms and face detection algorithms. Several systems used the OpenCV face detector while the better systems used proprietary software for the task of face detection. This ended up making the evaluation of verification algorithms challenging. The five speaker verification systems were based on one of two paradigms: a Gaussian Mixture Model (GMM) or Support Vector Machine (SVM) paradigm. In general the systems based on the SVM paradigm performed better than those based on the GMM paradigm.},
projects = {MOBIO},
}

@techreport{Orabona_Idiap-RR-05-2009,
title = {Model Adaptation with Least-Squares {SVM} for Adaptive Hand Prosthetics},
author = {Orabona, F. and Castellini, C. and Caputo, B. and Fiorilla, A. E. and Sandini, G.},
year = {2009},
month = mar,
type = {Idiap-RR},
number = {Idiap-RR-05-2009},
institution = {Idiap},
note = {Accepted in ICRA09},
keywords = {Report_IX, IM2.IP1, Group Bourlard, techreport},
internal-note = {title field was missing in the original entry; recovered from the ICRA 2009 paper of the same authors -- please verify},
abstract = {The state-of-the-art in control of hand prosthetics is far from optimal. The main control interface is represented by surface electromyography (EMG): the activation potentials of the remnants of large muscles of the stump are used in a non-natural way to control one or, at best, two degrees-of-freedom. This has two drawbacks: first, the dexterity of the prosthesis is limited, leading to poor interaction with the environment; second, the patient undergoes a long training time. As more dexterous hand prostheses are put on the market, the need for a finer and more natural control arises. Machine learning can be employed to this end. A desired feature is that of providing a pre-trained model to the patient, so that a quicker and better interaction can be obtained. To this end we propose model adaptation with least-squares SVMs, a technique that allows the automatic tuning of the degree of adaptation. We test the effectiveness of the approach on a database of EMG signals gathered from human subjects. We show that, when pre-trained models are used, the number of training samples needed to reach a certain performance is reduced, and the overall performance is increased, compared to what would be achieved by starting from scratch.},
projects = {Idiap, DIRAC},
}

@techreport{Negoescu_Idiap-RR-19-2010,
title = {Modeling and Understanding Flickr Communities through Topic-based Analysis},
author = {Negoescu, R. -A.  and Gatica-Perez, D. },
year = {2010},
month = jul,
type = {Idiap-RR},
number = {Idiap-RR-19-2010},
institution = {Idiap},
keywords = {Report_IX, IM2.IP1, Group Bourlard, techreport},
projects = {Idiap,
IM2,
SNSF-MULTI},
}

@article{Negoescu_IEEET-MM_2010,
title = {Modeling and Understanding Flickr Communities through Topic-based Analysis},
author = {Negoescu, R. -A.  and Gatica-Perez, D. },
journal = {IEEE Transactions on Multimedia},
year = {2010},
month = aug,
volume = {12},
number = {5},
pages = {399--416},
issn = {1520-9210},
doi = {10.1109/tmm.2010.2050649},
keywords = {Report_IX, IM2.IP1, Group Bourlard, article},
abstract = {With the increased presence of digital imaging devices there also came an explosion in the amount of multimedia content available online. Users have transformed from passive consumers of media into content creators and have started organizing themselves in and around online communities. Flickr has more than 30 million users and over 3 billion photos, and many of them are tagged and public. One very important aspect in Flickr is the ability of users to organize in self-managed communities called groups. This paper examines an unexplored problem, which is jointly analyzing Flickr groups and users. We show that although users and groups are conceptually different, in practice they can be represented in a similar way via a bag-of-tags derived from their photos, which is amenable for probabilistic topic modeling. We then propose a probabilistic topic model representation learned in an unsupervised manner that allows the discovery of similar users and groups beyond direct tag-based strategies and we demonstrate that higher-level information such as topics of interest are a viable alternative. On a dataset containing users of 10,000 Flickr groups and over 1 million photos, we show how this common topic-based representation allows for a novel analysis of the groups-users Flickr ecosystem, which results into new insights about the structure of the entities in this social media source. We demonstrate novel practical applications of our topic-based representation, such as similarity-based exploration of entities, or single and multi-topic tag-based search, which address current limitations in the ways Flickr is used today.},
projects = {Idiap,
SNSF-MULTI}
}

@incollection{Gatica-Perez_ACADEMICPRESS_2009,
title = {Modeling interest in face-to-face conversations from multimodal nonverbal behavior},
author = {Gatica-Perez, D. },
editor = {Thiran, J. -P.  and Bourlard, H.  and Marques, F. },
booktitle = {Multimodal Signal Processing},
publisher = {Academic Press},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group Bourlard, incollection},
projects = {Idiap},
}

@article{Ba_PAMI_2010,
title = {Multi-Person Visual Focus of Attention from Head Pose and Meeting Contextual Cues},
author = {Ba, S.  and Odobez, J. -M. },
crossref = {ba-idiap-rr-08-47},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
year = {2010},
note = {Accepted for publication, November 2009},
keywords = {Report_IX, IM2.IP1, Group Bourlard, article},
projects = {Idiap,
AMIDA,
IM2},
}

@phdthesis{Pinto_THESIS_2010,
  author   = {Pinto, J. P. },
  title    = {Multilayer Perceptron Based Hierarchical Acoustic Modeling for Automatic Speech Recognition},
  school   = {Ecole polytechnique fédérale de Lausanne},
  year     = {2010},
  note     = {Thèse Ecole polytechnique fédérale de Lausanne EPFL, no 4649 (2010), Programme doctoral Génie électrique, Faculté des sciences et techniques de l'ingénieur STI, Institut de génie électrique et électronique IEL (Laboratoire de l'IDIAP LIDIAP). Dir.: Hervé Bourlard},
  keywords = {Report_IX, IM2.IP1, Group Bourlard, phdthesis},
  abstract = {In this thesis, we investigate a hierarchical approach for estimating the phonetic class-conditional probabilities using a multilayer perceptron (MLP) neural network. The architecture consists of two MLP classifiers in cascade. The first MLP is trained in the conventional way using standard acoustic features with a temporal context of around 90 ms. The second MLP is trained on the phonetic class-conditional probabilities (or posterior features) estimated by the first classifier, but with a relatively longer temporal context of around 150-250 ms. The hierarchical architecture is motivated towards exploiting the useful contextual information in the sequence of posterior features which includes the evolution of the probability values within a phoneme (sub-phonemic) and its transition to/from neighboring phonemes (sub-lexical). As the posterior features are sparse and simple, the second classifier is able to learn the contextual information spanning a context as long as 250 ms. Extensive experiments on the recognition of phonemes on read speech as well as conversational speech show that the hierarchical approach yields significantly higher recognition accuracies. Analysis of the second MLP classifier using Volterra series reveal that it has learned the phonetic-temporal patterns in the posterior feature space which captures the confusions in phoneme classification at the output of the first classifier as well as the phonotactics of the language as observed in the training data. Furthermore, we show that the second MLP can be simple in terms of the number of model parameters and that it can be trained on lesser training data. 
The usefulness of the proposed hierarchical acoustic modeling in automatic speech recognition (ASR) is demonstrated using two applications (a) task adaptation where the goal is to exploit MLPs trained on large amount of data and available off-the-shelf to new tasks and (b) large vocabulary continuous ASR on broadcast news and broadcast conversations in Mandarin. Small vocabulary isolated word recognition and task adaptation studies are performed on the Phonebook database and the large vocabulary speech recognition studies are performed on the DARPA GALE Mandarin database.},
  projects = {Idiap,
SNSF-KEYSPOT},
}

@techreport{Korchagin_Idiap-Com-01-2009,
title = {Multimodal Data Flow Controller},
author = {Korchagin, D. },
year = {2009},
month = nov,
type = {Idiap-Com},
number = {Idiap-Com-01-2009},
institution = {Idiap},
address = {P.O. Box 592, CH-1920 Martigny, Switzerland},
keywords = {Report_IX, IM2.IP1, Group Bourlard, techreport},
abstract = {In this paper, we describe a multimodal data flow controller capable of reading most multichannel sound cards and web cameras, synchronising media streams, being a server to stream captured media over TCP in raw format, being a client to receive media streams over TCP in raw format and using unified interface for online transmission.},
projects = {Idiap,
TA2},
}

@book{Thiran_ACADEMICPRESS_2009,
title = {Multimodal Signal Processing: Methods and Techniques to Build Multimodal Interactive Systems},
author = {Thiran, J. -Ph.  and Bourlard, H.  and Marques, F. },
publisher = {Academic Press},
year = {2009},
isbn = {0-12-374825-9},
keywords = {Report_IX, IM2.IP1, Group Bourlard, book},
abstract = {Multimodal signal processing is an important new field that processes signals from a variety of modalities - speech, vision, language, text- derived from one source, which aids human-computer and human-human interaction. The overarching theme of this book is the application of signal processing and statistical machine learning techniques to problems arising in this field. It gives an overview of the field, the capabilities and limitations of current technology, and the technical challenges that must be overcome to realize multimodal interactive systems. As well as state-of-the-art methods in multimodal signal and image modeling and processing, the book gives numerous examples and applications of multimodal interactive systems, including human-computer and human-human interaction. This is the definitive reference in multimodal signal processing, edited and contributed by the leading experts, for signal processing researchers and graduates, R{\&}D engineers and computer engineers.},
projects = {Idiap}
}

@inproceedings{Vijayasenan_ICASSP2010_2010,
title = {Multistream Speaker Diarization beyond Two Acoustic Feature Streams},
author = {Vijayasenan, D.  and Valente, F.  and Bourlard, H. },
crossref = {diarmulti4feat},
booktitle = {International Conference on Acoustics, Speech, and Signal Processing},
year = {2010},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {Speaker diarization for meetings data are recently converging towards multistream systems. The most common complementary features used in combination with MFCC are Time Delay of Arrival (TDOA). Also other features have been proposed although, there are no reported improvements on top of MFCC TDOA systems. In this work we investigate the combination of other feature sets along with MFCC TDOA. We discuss issues and problems related to the weighting of four different streams proposing a solution based on a smoothed version of the speaker error. Experiments are presented on NIST RT06 meeting diarization evaluation. Results reveal that the combination of four acoustic feature streams results in a 30% relative improvement with respect to the MFCC TDOA feature combination. To the authors' best knowledge, this is the first successful attempt to improve the MFCC TDOA baseline including other feature streams.},
projects = {Idiap,
AMIDA,
IM2},
}

@inproceedings{Do_AISTATS_2010,
title = {Neural conditional random fields},
author = {Do, Trinh-Minh-Tri and Artieres, Thierry},
booktitle = {Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics},
year = {2010},
month = may,
volume = {9},
pages = {177--184},
publisher = {JMLR: W{\&}CP},
location = {Chia Laguna, Sardinia, Italy},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {We propose a non-linear graphical model for structured prediction. It combines the power of deep neural networks to extract high level features with the graphical framework of Markov networks, yielding a powerful and scalable probabilistic model that we apply to signal labeling tasks.},
projects = {Idiap},
}

@techreport{Luo_Idiap-RR-06-2010,
title = {OM-2: An Online Multi-class Multi-kernel Learning Algorithm},
author = {Luo, J.  and Orabona, F.  and Fornoni, Marco and Caputo, B.  and Cesa-Bianchi, Nicolo},
year = {2010},
month = apr,
type = {Idiap-RR},
number = {Idiap-RR-06-2010},
institution = {Idiap},
keywords = {Report_IX, IM2.IP1, Group Bourlard, techreport},
abstract = {Efficient learning from massive amounts of information is a hot topic in computer vision. Available training sets contain many examples with several visual descriptors, a setting in which current batch approaches are typically slow and do not scale well. In this work we introduce a theoretically motivated and efficient online learning algorithm for the Multi Kernel Learning (MKL) problem. For this algorithm we prove a theoretical bound on the number of multiclass mistakes made on any arbitrary data sequence. Moreover, we empirically show that its performance is on par, or better, than standard batch MKL (e.g. SILP, SimpleMKL) algorithms.},
}

@techreport{Soldo_Idiap-RR-37-2009,
title = {On MLP-based Posterior Features for Template-based ASR},
author = {Soldo, Serena and Magimai-Doss, M.  and Pinto, J. P.  and Bourlard, H. },
year = {2009},
month = dec,
type = {Idiap-RR},
number = {Idiap-RR-37-2009},
institution = {Idiap},
keywords = {Report_IX, IM2.IP1, Group Bourlard, techreport},
abstract = {We investigate the invariance of posterior features estimated using MLP trained on auxiliary corpus towards different data condition and different distance measures for matching posterior features in the context of template-based ASR. Through ASR studies on isolated word recognition task we show that posterior features estimated using MLP trained on auxiliary corpus without any kind of adaptation can achieve comparable or better performance when compared to the case where the MLP is trained on the corpus same as that of the test set. We also show that local scores, weighted symmetric KL-divergence and Bhattacharya distance yield better systems compared to Hellinger distance, cosine angle, L1-norm, L2-norm, dot product, and cross entropy.},
projects = {Idiap},
}

@techreport{Marcel_Idiap-RR-30-2010,
title = {On the Results of the First Mobile Biometry (MOBIO) Face and Speaker Verification Evaluation},
author = {Marcel, S.  and McCool, C.  and Matejka, Pavel and Ahonen, Timo and Cernocky, Jan and others},
year = {2010},
month = aug,
type = {Idiap-RR},
number = {Idiap-RR-30-2010},
institution = {Idiap},
keywords = {Report_IX, IM2.IP1, Group Bourlard, techreport},
abstract = {This paper evaluates the performance of face and speaker verification techniques in the context of a mobile environment. The mobile environment was chosen as it provides a realistic and challenging test-bed for biometric person verification techniques to operate. For instance the audio environment is quite noisy and there is limited control over the illumination conditions and the pose of the subject for the video. To conduct this evaluation, a part of a database captured during the ``Mobile Biometry'' (MOBIO) European Project was used. In total there were nine participants to the evaluation who submitted a face verification system and five participants who submitted speaker verification systems.},
projects = {MOBIO},
}

@article{Galbally_PR_2009,
title = {On the vulnerability of face verification systems to hill-climbing attacks},
author = {Galbally, J.  and McCool, C.  and Fierrez, J.  and Marcel, S.  and Ortega-Garcia, J. },
journal = {Pattern Recognition},
year = {2009},
keywords = {Report_IX, IM2.IP1, Group Bourlard, article},
abstract = {In this paper, we use a hill-climbing attack algorithm based on Bayesian adaptation to test the vulnerability of two face recognition systems to indirect attacks. The attacking technique uses the scores provided by the matcher to adapt a global distribution computed from an independent set of users, to the local specificities of the client being attacked. The proposed attack is evaluated on an eigenface-based and a parts-based face verification system using the XM2VTS database. Experimental results demonstrate that the hill-climbing algorithm is very efficient and is able to bypass over 85% of the attacked accounts (for both face recognition systems). The security flaws of the analyzed systems are pointed out and possible countermeasures to avoid them are also proposed.},
projects = {Idiap,
MOBIO}
}

@inproceedings{Orabona_CVPR_2010,
title = {Online-Batch Strongly Convex Multi Kernel Learning},
author = {Orabona, F.  and Luo, J.  and Caputo, B. },
booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
year = {2010},
month = jun,
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {Several object categorization algorithms use kernel methods over multiple cues, as they offer a principled approach to combine multiple cues, and to obtain state-of-the-art performance. A general drawback of these strategies is the high computational cost during training, that prevents their application to large-scale problems. They also do not provide theoretical guarantees on their convergence rate. Here we present a Multiclass Multi Kernel Learning (MKL) algorithm that obtains state-of-the-art performance in a considerably lower training time. We generalize the standard MKL formulation to introduce a parameter that allows us to decide the level of sparsity of the solution. Thanks to this new setting, we can directly solve the problem in the primal formulation. We prove theoretically and experimentally that 1) our algorithm has a faster convergence rate as the number of kernels grow; 2) the training complexity is linear in the number of training examples; 3) very few iterations are enough to reach good solutions. Experiments on three standard benchmark databases support our claims.},
projects = {Idiap,
DIRAC},
}

@inproceedings{Korchagin_AC_2009,
title = {Out-of-Scene AV Data Detection},
author = {Korchagin, D. },
crossref = {Korchagin_Idiap-RR-31-2009},
booktitle = {Proceedings IADIS International Conference Applied Computing},
year = {2009},
month = nov,
volume = {2},
pages = {244--248},
location = {Rome, Italy},
address = {P.O. Box 592, CH-1920 Martigny, Switzerland},
isbn = {978-972-8924-97-3},
keywords = {confidence estimation, pattern matching, time-frequency analysis, Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {In this paper, we propose a new approach for the automatic audio-based out-of-scene detection of audio-visual data, recorded by different cameras, camcorders or mobile phones during social events. All recorded data is clustered to out-of-scene and in-scene datasets based on confidence estimation of cepstral pattern matching with a common master track of the event, recorded by a reference camera. The core of the algorithm is based on perceptual time-frequency analysis and confidence measure based on distance distribution variance. The results show correct clustering in 100% of cases for a real life dataset and surpass the performance of cross correlation while keeping lower system requirements.},
projects = {Idiap,
TA2},
}

@inproceedings{Sanchez-Cortes_ICMI-MLMI_2009,
title = {Predicting Remote Versus Collocated Group Interactions using Nonverbal Cues},
author = {Sanchez-Cortes, Dairazalia and Jayagopi, D.  and Gatica-Perez, D. },
booktitle = {Proc. Int. Conf. on Multimodal Interfaces, Workshop on Multimodal Sensor-Based Systems and Mobile Phones for Social Computing},
year = {2009},
month = nov,
location = {Cambridge},
isbn = {978-1-60558-694-6},
doi = {10.1145/1641389.1641392},
keywords = {Characterizing small groups, Nonverbal behavior, Remote meetings, Report_IX, IM2.IP3, Group Bourlard, inproceedings},
abstract = {This paper addresses two problems: Firstly, the problem of classifying remote and collocated small-group working meetings, and secondly, the problem of identifying the remote participant, using in both cases nonverbal behavioral cues. Such classifiers can be used to improve the design of remote collaboration technologies to make remote interactions as effective as possible to collocated interactions. We hypothesize that the difference in the dynamics between collocated and remote meetings is significant and measurable using speech activity based nonverbal cues. Our results on a publicly available dataset - the Augmented Multi-Party Interaction with Distance Access (AMIDA) corpus - show that such an approach is promising, although more controlled settings and more data are needed to explore the addressed problems further.},
projects = {Idiap,
AMIDA,
IM2}
}

@inproceedings{Varadarajan_BMVC2010_2010,
title = {Probabilistic Latent Sequential Motifs: Discovering temporal activity patterns in video scenes},
author = {Varadarajan, Jagannadan and Emonet, Remi and Odobez, J. -M. },
booktitle = {BMVC 2010},
year = {2010},
month = sep,
pages = {117.1--117.11},
publisher = {BMVA Press},
organization = {Aberystwyth University},
location = {Aberystwyth},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {This paper introduces a novel probabilistic activity modeling approach that mines recurrent sequential patterns from documents given as word-time occurrences. In this model, documents are represented as a mixture of sequential activity motifs (or topics) and their starting occurrences. The novelties are threefold. First, unlike previous approaches where topics only modeled the co-occurrence of words at a given time instant, our topics model the co-occurrence and temporal order in which the words occur within a temporal window. Second, our model accounts for the important case where activities occur concurrently in the document. And third, our method explicitly models with latent variables the starting time of the activities within the documents, enabling to implicitly align the occurrences of the same pattern during the joint inference of the temporal topics and their starting times. The model and its robustness to the presence of noise have been validated on synthetic data. Its effectiveness is also illustrated in video activity analysis from low-level motion features, where the discovered topics capture frequent patterns that implicitly represent typical trajectories of scene objects.},
projects = {SNSF-MULTI},
}

@article{Farrahi_IEEEJ-STSP_2010,
title = {Probabilistic Mining of Socio-Geographic Routines from Mobile Phone Data},
author = {Farrahi, K.  and Gatica-Perez, D. },
journal = {IEEE Journal of Selected Topics in Signal Processing},
year = {2010},
month = aug,
volume = {4},
number = {4},
pages = {746--755},
keywords = {Report_IX, IM2.IP3, Group Bourlard, article},
abstract = {There is relatively little work on the investigation of large-scale human data in terms of multimodality for human activity discovery. In this paper we suggest that human interaction data, or human proximity, obtained by mobile phone Bluetooth sensor data, can be integrated with human location data, obtained by mobile cell tower connections, to mine meaningful details about human activities from large and noisy datasets. We propose a model, called bag of multimodal behavior, that integrates the modeling of variations of location over multiple time-scales, and the modeling of interaction types from proximity. Our representation is simple yet robust to characterize real-life human behavior sensed from mobile phones, which are devices capable of capturing large-scale data known to be noisy and incomplete. We use an unsupervised approach, based on probabilistic topic models, to discover latent human activities in terms of the joint interaction and location behaviors of 97 individuals over the course of approximately a 10 month period using data from MIT's Reality Mining project. Some of the human activities discovered with our multimodal data representation include ``going out from 7pm-midnight alone'' and ``working from 11am-5pm with 3-5 other people'', further finding that this activity dominantly occurs on specific days of the week. Our methodology also finds dominant work patterns occurring on other days of the week. We further demonstrate the feasibility of the topic modeling framework to discover human routines to predict missing multimodal phone data on specific times of the day.},
projects = {Idiap,
SNSF-MULTI},
}

@inproceedings{Roman-Rangel_ICCV_2009,
title = {Retrieving Ancient Maya Glyphs with Shape Context},
author = {Roman-Rangel, Edgar and Pallan, Carlos and Odobez, J. -M.  and Gatica-Perez, D. },
booktitle = {2009 IEEE 12th International Conference on Computer Vision Workshops, ICCV Workshops},
year = {2009},
month = oct,
publisher = {IEEE},
location = {Kyoto, Japan},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {We introduce an interdisciplinary project for archaeological and computer vision research teams on the analysis of the ancient Maya writing system. Our first task is the automatic retrieval of Maya syllabic glyphs using the Shape Context descriptor. We investigated the effect of several parameters to adapt the shape descriptor given the high complexity of the shapes and their diversity in our data. We propose an improvement in the cost function used to compute similarity between shapes making it more restrictive and precise. Our results are promising, they are analyzed via standard image retrieval measurements.},
projects = {Idiap},
}

@inproceedings{Favre_ACII_2009,
title = {Social Network Analysis in Multimedia Indexing: Making Sense of People in Multiparty Recordings},
author = {Favre, S. },
booktitle = {Proceedings of the Doctoral Consortium of the International Conference on Affective Computing {\&} Intelligent Interaction (ACII)},
year = {2009},
pages = {25--32},
keywords = {Report_IX, IM2.IP3, Group Bourlard, inproceedings},
abstract = {This paper presents an automatic approach to analyze the human interactions appearing in multiparty data, aiming at understanding the data content and at extracting social information such as Which role do people play?, What is their attitude?, or Can people be split into meaningful groups?. To extract such information, we use a set of mathematical techniques, namely Social Networks Analysis (SNA), developed by sociologists to analyze social interactions. This paper shows that a strong connection can be established between the content of broadcast data and the social interactions of the individuals involved in the recordings. Experiments aiming at assigning each individual to a social group corresponding to a specific topic in broadcast news, and experiments aiming at recognizing the role played by each individual in multiparty data are presented in this paper. The results achieved are satisfactory, which suggests on one side that the application of SNA to similar problems could lead to useful contributions in the domain of multimedia content analysis, and on the other side, that the presented analysis of social interactions could be a significant breakthrough for affective computing.},
projects = {Idiap,
SNSF-MULTI,
SSPNet},
}

@inproceedings{Asaei_INTERSPEECH_2010,
title = {Sparse Component Analysis for Speech Recognition in Multi-Speaker Environment},
author = {Asaei, Afsaneh and Bourlard, H.  and Garner, P. N. },
booktitle = {Proceedings of Interspeech},
year = {2010},
month = sep,
location = {Makuhari, Japan},
keywords = {Automatic Speech Recognition, Overlapping Speech, Sparse Component Analysis, Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {Sparse Component Analysis is a relatively young technique that relies upon a representation of signal occupying only a small part of a larger space. Mixtures of sparse components are disjoint in that space. As a particular application of sparsity of speech signals, we investigate the DUET blind source separation algorithm in the context of speech recognition for multi-party recordings. We show how DUET can be tuned to the particular case of speech recognition with interfering sources, and evaluate the limits of performance as the number of sources increases. We show that the separated speech fits a common metric for sparsity, and conclude that sparsity assumptions lead to good performance in speech separation and hence ought to benefit other aspects of the speech recognition chain.},
projects = {Idiap,
IM2},
}

@inproceedings{Wester_SSW_2010,
title = {Speaker adaptation and the evaluation of speaker similarity in the EMIME speech-to-speech translation project},
author = {Wester, Mirjam and Dines, J.  and Gibson, Matthew and Liang, H.  and Wu, Yi-Jian and Saheer, L.  and King, S.  and Oura, Keiichiro and Garner, P. N.  and Byrne, William and Guan, Yong and Hirsim{\"a}ki, Teemu and Karhila, Reima and Kurimo, Mikko and Shannon, Matt and Shiota, Sayaka and Tian, Jilei and Tokuda, Keiichi and Yamagishi, J. },
booktitle = {Proceedings of the 7th ISCA Speech Synthesis Workshop},
year = {2010},
month = sep,
location = {Kyoto, Japan},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {This paper provides an overview of speaker adaptation research carried out in the EMIME speech-to-speech translation (S2ST) project. We focus on how speaker adaptation transforms can be learned from speech in one language and applied to the acous- tic models of another language. The adaptation is transferred across languages and/or from recognition models to synthesis models. The various approaches investigated can all be viewed as a process in which a mapping is defined in terms of either acoustic model states or linguistic units. The mapping is used to transfer either speech data or adaptation transforms between the two models. Because the success of speaker adaptation in text-to-speech synthesis is measured by judging speaker simi- larity, we also discuss issues concerning evaluation of speaker similarity in an S2ST scenario.},
projects = {EMIME},
}

@techreport{Saheer_Idiap-RR-25-2010,
title = {Study of Jacobian Normalization for VTLN},
author = {Saheer, L.  and Garner, P. N.  and Dines, J. },
year = {2010},
month = jul,
type = {Idiap-RR},
number = {Idiap-RR-25-2010},
institution = {Idiap},
keywords = {Report_IX, IM2.IP1, Group Bourlard, techreport},
abstract = {The divergence of the theory and practice of vocal tract length normalization (VTLN) is addressed, with particular emphasis on the role of the Jacobian determinant. VTLN is placed in a Bayesian setting, which brings in the concept of a prior on the warping factor. The form of the prior, together with acoustic scaling and numerical conditioning are then discussed and evaluated. It is concluded that the Jacobian determinant is important in VTLN, especially for the high dimensional features used in HMM based speech synthesis, and difficulties normally associated with the Jacobian determinant can be attributed to prior and scaling.},
projects = {Idiap},
}

@inproceedings{Hain_INTERSPEECH_2010,
title = {The AMIDA 2009 Meeting Transcription System},
author = {Hain, T.  and Burget, Lukas and Dines, J.  and Garner, P. N.  and El Hannani, A.  and Huijbregts, M.  and Karafiat, M.  and Lincoln, M.  and Wan, V. },
booktitle = {Proceedings of Interspeech},
year = {2010},
month = sep,
location = {Makuhari, Japan},
keywords = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
abstract = {We present the AMIDA 2009 system for participation in the NIST RT'2009 STT evaluations. Systems for close-talking, far field and speaker attributed STT conditions are described. Improvements to our previous systems are: segmentation and diarisation; stacked bottle-neck posterior feature extraction; fMPE training of acoustic models; adaptation on complete meetings; improvements to WFST decoding; automatic optimisation of decoders and system graphs. Overall these changes gave a 6- 13% relative reduction in word error rate while at the same time reducing the real-time factor by a factor of five and using considerably less data for acoustic model training.},
projects = {AMIDA},
}

@article{Estrella_LINGUISTICAANTVERPIENSIA_2009,
title = {The FEMTI guidelines for contextual MT evaluation: principles and tools},
author = {Estrella, P.  and Popescu-Belis, A.  and King, M. },
journal = {Linguistica Antverpiensia New Series},
year = {2009},
volume = {8},
note = {Special issue: Evaluation of Translation Technology},
keywords = {Report_IX, IM2.IP1, Group Bourlard, article},
projects = {Idiap,
IM2}
}

@article{Pronobis_IMAVIS_2010,
title = {The More you Learn, the Less you Store: Memory-controlled Incremental SVM for Visual Place Recognition},
author = {Pronobis, A.  and Luo, J.  and Caputo, Barbara},
journal = {Image and Vision Computing},
year = {2010},
month = feb,
doi = {10.1016/j.imavis.2010.01.015},
keywords = {Report_IX, IM2.IP1, Group Bourlard, article},
abstract = {The capability to learn from experience is a key property for autonomous cognitive systems working in realistic settings. To this end, this paper presents an SVM-based algorithm, capable of learning model representations incrementally while keeping under control memory requirements. We combine an incremental extension of SVMs with a method reducing the number of support vectors needed to build the decision function without any loss in performance introducing a parameter which permits a user-set trade-off between performance and memory. The resulting algorithm is able to achieve the same recognition results as the original incremental method while reducing the memory growth. Our method is especially suited to work for autonomous systems in realistic settings. We present experiments on two common scenarios in this domain: adaptation in presence of dynamic changes and transfer of knowledge between two different autonomous agents, focusing in both cases on the problem of visual place recognition applied to mobile robot topological localization. Experiments in both scenarios clearly show the power of our approach.},
projects = {Idiap,
DIRAC},
}

@inproceedings{Hung_ACMMM2010_2010,
  title     = {The Wolf Corpus: Exploring group behaviour in a competitive role-playing game},
  author    = {Hung, H.  and Chittaranjan, Gokul},
  booktitle = {ACM Multimedia},
  year      = {2010},
  month     = oct,
  keywords  = {corpus, deception, human behaviour, multi-party, Report_IX, IM2.IP3, Group Bourlard, inproceedings},
  abstract  = {In this paper we present the Idiap Wolf Database. This is an audio-visual corpus containing natural conversational data of volunteers who took part in a competitive role-playing game. Four groups of 8-12 people were recorded. In total, just over 7 hours of interactive conversational data was collected. The data has been annotated in terms of the roles and outcomes of the game. There are 371 examples of different roles played over 50 games. Recordings were made with headset microphones, an 8-microphone array, and 3 video cameras and are fully synchronised. The novelty of this data is that some players have deceptive roles and the participants do not know what roles other people play.},
  projects  = {Idiap, AMIDA}
}

@inproceedings{Varadarajan_INTERNATIONALCONFERENCEINCOMPUTERVISION_2009,
  title        = {Topic Models for Scene Analysis and Abnormality Detection},
  author       = {Varadarajan, J.  and Odobez, J. -M. },
  booktitle    = {9th International Workshop in Visual Surveillance},
  year         = {2009},
  month        = oct,
  publisher    = {IEEE},
  organization = {IEEE},
  location     = {Kyoto, Japan},
  keywords     = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
  projects     = {Idiap, SNSF-MULTI}
}

@inproceedings{Bunt_LREC2010_2010,
  title     = {Towards a standard for dialogue act annotation},
  author    = {Bunt, Harry and Alexandersson, Jan and Carletta, J.  and Choe, Jae-Woong and Fang, Alex and Hasida, Koiti and Lee, Kiyong and Petukhova, Volha and Popescu-Belis, A.  and Romary, Laurent and Soria, Claudia and Traum, David},
  booktitle = {7th International Conference on Language Resources and Evaluation},
  year      = {2010},
  month     = may,
  location  = {Malta},
  url       = {http://www.lrec-conf.org/proceedings/lrec2010/summaries/560.html},
  keywords  = {dialogue, semantics, Report_IX, IM2.IP1, Group Bourlard, inproceedings},
  abstract  = {This paper describes an ISO project which aims at developing a standard for annotating spoken and multimodal dialogue with semantic information concerning the communicative functions of utterances, the kind of semantic content they address, and their relations with what was said and done earlier in the dialogue. The project, ISO 24617-2 "Semantic annotation framework, Part 2: Dialogue acts", is currently at DIS stage. The proposed annotation schema distinguishes 9 orthogonal dimensions, allowing each functional segment in dialogue to have a function in each of these dimensions, thus accounting for the multifunctionality that utterances in dialogue often have. A number of core communicative functions is defined in the form of ISO data categories, available at http://semantic-annotation.uvt.nl/dialogue-acts/iso-datcats.pdf; they are divided into "dimension-specific" functions, which can be used only in a particular dimension, such as Turn Accept in the Turn Management dimension, and "general-purpose" functions, which can be used in any dimension, such as Inform and Request. An XML-based annotation language, "DiAML" is defined, with an abstract syntax, a semantics, and a concrete syntax.},
  projects  = {Idiap, IM2}
}

@inproceedings{Imseng_INTERSPEECH-2_2010,
  title     = {Towards mixed language speech recognition systems},
  author    = {Imseng, D.  and Bourlard, H.  and Magimai-Doss, M. },
  crossref  = {Imseng_Idiap-RR-15-2010},
  booktitle = {Proceedings of Interspeech},
  year      = {2010},
  month     = sep,
  location  = {Makuhari, Japan},
  keywords  = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
  abstract  = {Multilingual speech recognition obviously involves numerous research challenges, including common phoneme sets, adaptation on limited amount of training data, as well as mixed language recognition (common in many countries, like Switzerland). In this latter case, it is not even possible to assume that one knows in advance the language being spoken. This is the context and motivation of the present work. We indeed investigate how current state-of-the-art speech recognition systems can be exploited in multilingual environments, where the language (from an assumed set of five possible languages, in our case) is not a priori known during recognition. We combine monolingual systems and extensively develop and compare different features and acoustic models. On SpeechDat(II) datasets, and in the context of isolated words, we show that it is actually possible to approach the performances of monolingual systems even if the identity of the spoken language is not a priori known.},
  projects  = {Idiap, SNSF-MULTI, IM2}
}

@inproceedings{Kiukkonen_ICPS,
  title     = {Towards rich mobile phone datasets: Lausanne data collection campaign},
  author    = {Kiukkonen, N. and Blom, J. and Dousse, O. and Gatica-Perez, Daniel and Laurila, J.},
  booktitle = {Proc. ACM Int. Conf. on Pervasive Services (ICPS), Berlin.},
  year      = {2010},
  month     = jul,
  keywords  = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
  projects  = {Idiap}
}

@inproceedings{Garner_INTERSPEECH_2010,
  title     = {Tracter: A Lightweight Dataflow Framework},
  author    = {Garner, P. N.  and Dines, J. },
  crossref  = {Garner_Idiap-RR-10-2010},
  booktitle = {Proceedings of Interspeech},
  year      = {2010},
  month     = sep,
  location  = {Makuhari, Japan},
  keywords  = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
  abstract  = {Tracter is introduced as a dataflow framework particularly useful for speech recognition. It is designed to work on-line in real-time as well as off-line, and is the feature extraction means for the Juicer transducer based decoder. This paper places Tracter in context amongst the dataflow literature and other commercial and open source packages. Some design aspects and capabilities are discussed. Finally, a fairly large processing graph incorporating voice activity detection and feature extraction is presented as an example of Tracter's capabilities.},
  projects  = {AMIDA, IM2, TA2}
}

@techreport{Popescu-Belis_Idiap-RR-38-2009,
  title       = {User Interface Design in a Just-in-time Retrieval System for Meetings},
  author      = {Popescu-Belis, A.  and Poller, P.  and Kilgour, J.  and Flynn, M.  and Germesin, Sebastian and Nanchen, A.  and Yazdani, M. },
  year        = {2009},
  month       = dec,
  type        = {Idiap-RR},
  number      = {Idiap-RR-38-2009},
  institution = {Idiap},
  keywords    = {Report_IX, IM2.IP1, Group Bourlard, techreport},
  abstract    = {The Automatic Content Linking Device (ACLD) is a just-in-time multimedia retrieval system that monitors and supports the conversation among a small group of people within a meeting. The ACLD retrieves from a repository, at regular intervals, information that might be relevant to the group's activity, and presents it through a graphical user interface (GUI). The repository contains documents from past meetings such as slides or reports along with processed meeting recordings; in parallel, Web searches are run as well. The acceptance by users of such a system depends considerably on the GUI, along with the performance of retrieval. The trade-off between informativeness and unobtrusiveness is studied here through the design of a series of GUIs. The requirements and feedback collected while demonstrating the successive versions show that users vary considerably in their preferences for a given style of interface. After studying two extreme options, a widget vs. a wide-screen UI, we conclude that a modular UI, which can be flexibly structured and resized by users, is the most sensible design for a just-in-time multimedia retrieval system.},
  projects    = {Idiap, AMIDA, IM2}
}

@inproceedings{Garau_ICASSP2010_2010,
  title     = {Using Audio and Visual Cues for Speaker Diarisation Initialisation},
  author    = {Garau, G.  and Bourlard, H. },
  booktitle = {International Conference on Acoustics, Speech and Signal Processing},
  year      = {2010},
  month     = mar,
  keywords  = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
  projects  = {Idiap, AMIDA, IM2}
}

@inproceedings{Saheer_ICASSP_2010,
  title     = {VTLN Adaptation for Statistical Speech Synthesis},
  author    = {Saheer, L.  and Garner, P. N.  and Dines, J.  and Liang, H. },
  booktitle = {Proceedings of ICASSP},
  year      = {2010},
  location  = {Dallas, Texas},
  keywords  = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
  abstract  = {The advent of statistical speech synthesis has enabled the unification of the basic techniques used in speech synthesis and recognition. Adaptation techniques that have been successfully used in recognition systems can now be applied to synthesis systems to improve the quality of the synthesized speech. The application of vocal tract length normalization (VTLN) for synthesis is explored in this paper. VTLN based adaptation requires estimation of a single warping factor, which can be accurately estimated from very little adaptation data and gives additive improvements over CMLLR adaptation. The challenge of estimating accurate warping factors using higher order features is solved by initializing warping factor estimation with the values calculated from lower order features.},
  projects  = {Idiap, EMIME}
}

@incollection{Gatica-Perez_HANDBOOKOFAMBIENTINTELLIGENCEANDSMARTENVIRONMENTS_2010,
  title     = {Visual Attention, Speaking Activity, and Group Conversational Analysis in Multi-Sensor Environments},
  author    = {Gatica-Perez, D.  and Odobez, J. -M. },
  editor    = {Nakashima, H. and Augusto, J. and Aghajan, H.},
  booktitle = {Handbook of Ambient Intelligence and Smart Environments},
  year      = {2010},
  publisher = {Springer},
  keywords  = {Report_IX, IM2.IP1, Group Bourlard, incollection},
  projects  = {Idiap}
}

@inproceedings{Roy_ACMSAC2010_2010,
  title        = {Visual processing-inspired Fern-Audio features for Noise-Robust Speaker Verification},
  author       = {Roy, A.  and Marcel, S. },
  crossref     = {Roy_Idiap-RR-29-2009},
  booktitle    = {ACM 25th Symposium on Applied Computing},
  year         = {2010},
  month        = mar,
  location     = {Sierre, Switzerland},
  organization = {Association for Computing Machinery},
  keywords     = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
  abstract     = {In this paper, we consider the problem of speaker verification as a two-class object detection problem in computer vision, where the object instances are 1-D short-time spectral vectors obtained from the speech signal. More precisely, we investigate the general problem of speaker verification in the presence of additive white Gaussian noise, which we consider as analogous to visual object detection under varying illumination conditions. Inspired by their recent success in illumination-robust object detection, we apply a certain class of binary-valued pixel-pair based features called Ferns for noise-robust speaker verification. Intensive experiments on a benchmark database according to a standard evaluation protocol have shown the advantage of the proposed features in the presence of moderate to extremely high amounts of additive noise.},
  projects     = {Idiap, MOBIO, SNSF-MULTI}
}

@inproceedings{Biel_ICWSM,
  title     = {Voices of Vlogging},
  author    = {Biel, Joan-Isaac and Gatica-Perez, Daniel},
  booktitle = {Proc. AAAI Int. Conf. on Weblogs and Social Media (ICWSM), Washington DC},
  year      = {2010},
  month     = may,
  keywords  = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
  abstract  = {Vlogs have rapidly evolved from the ``chat from your bedroom'' format to a highly creative form of expression and communication. However, despite the high popularity of vlogging, automatic analysis of conversational vlogs have not been attempted in the literature. In this paper, we present a novel analysis of conversational vlogs based on the characterization of vloggers' nonverbal behavior. We investigate the use of four nonverbal cues extracted automatically from the audio channel to measure the behavior of vloggers and explore the relation to their degree of popularity and that of their videos. Our study is validated on over 2200 videos and 150 hours of data, and shows that one nonverbal cue (speaking time) is correlated with levels of popularity with a medium size effect.},
  projects  = {Idiap, IM2}
}

@inproceedings{Biel_ACMMM09_2009,
  title     = {Wearing a YouTube hat: directors, comedians, gurus, and user aggregated behavior},
  author    = {Biel, Joan-Isaac and Gatica-Perez, D. },
  booktitle = {Proceedings of the 17th ACM International Conference on Multimedia},
  year      = {2009},
  month     = oct,
  pages     = {833--836},
  publisher = {ACM},
  keywords  = {social networks, user aggregated behavior, video-sharing, YouTube, Report_IX, IM2.IP3, Group Bourlard, inproceedings},
  abstract  = {While existing studies on YouTube's massive user-generated video content have mostly focused on the analysis of videos, their characteristics, and network properties, little attention has been paid to the analysis of users' long-term behavior as it relates to the roles they self-define and (explicitly or not) play in the site. In this paper, we present a novel statistical analysis of aggregated user behavior in YouTube from the novel perspective of user categories, a feature that allows people to ascribe to popular roles and to potentially reach certain communities. Using a sample of 270,000 users, we found that a high level of interaction and participation is concentrated on a relatively small, yet significant, group of users, following recognizable patterns of personal and social involvement. Based on our analysis, we also show that by using simple behavioral features from user profiles, people can be automatically classified according to their category with accuracy rates of up to 73%.},
  projects  = {Idiap, IM2}
}

@inproceedings{Jie_NIPS2009,
  title        = {Who's Doing What: Joint Modeling of Names and Verbs for Simultaneous Face and Pose Annotation},
  author       = {Luo, J.  and Caputo, B.  and Ferrari, V. },
  booktitle    = {Advances in Neural Information Processing Systems 22 (NIPS09)},
  year         = {2009},
  month        = dec,
  publisher    = {MIT Press},
  organization = {NIPS Foundation},
  keywords     = {Report_IX, IM2.IP1, Group Bourlard, inproceedings},
  abstract     = {Given a corpus of news items consisting of images accompanied by text captions, we want to find out ``who's doing what'', i.e. associate names and action verbs in the captions to the face and body pose of the persons in the images. We present a joint model for simultaneously solving the image-caption correspondences and learning visual appearance models for the face and pose classes occurring in the corpus. These models can then be used to recognize people and actions in novel images without captions. We demonstrate experimentally that our joint ``face and pose'' model solves the correspondence problem better than earlier models covering only the face, and that it can perform recognition of new uncaptioned images.},
  projects     = {Idiap, DIRAC}
}

@article{Motlicek_EURASIP-2_2009,
  title    = {Wide-Band Audio Coding based on Frequency Domain Linear Prediction},
  author   = {Motlicek, P.  and Ganapathy, S.  and Hermansky, H.  and Garudadri, H. },
  editor   = {Raj, Bhiksha},
  crossref = {Motlicek_Idiap-RR-32-2009},
  journal  = {EURASIP Journal on Audio Speech and Music Processing},
  year     = {2010},
  month    = feb,
  volume   = {2010},
  number   = {856280},
  pages    = {14},
  note     = {Special Issue: Scalable Audio-Content Analysis},
  url      = {http://www.hindawi.com/journals/asmp/2010/856280.html},
  doi      = {10.1155/2010/856280},
  keywords = {Report_IX, IM2.IP1, Group Bourlard, article},
  abstract = {We revisit an original concept of speech coding in which the signal is separated into the carrier modulated by the signal envelope. A recently developed technique, called frequency-domain linear prediction (FDLP), is applied for the efficient estimation of the envelope. The processing in the temporal domain allows for a straightforward emulation of the forward temporal masking. This, combined with an efficient nonuniform sub-band decomposition and application of noise shaping in spectral domain instead of temporal domain (a technique to suppress artifacts in tonal audio signals), yields a codec that does not rely on the linear speech production model but rather uses well-accepted concept of frequency-selective auditory perception. As such, the codec is not only specific for coding speech but also well suited for coding other important acoustic signals such as music and mixed content. The quality of the proposed codec at 66 kbps is evaluated using objective and subjective quality assessments. The evaluation indicates competitive performance with the MPEG codecs operating at similar bit rates.},
  projects = {Idiap}
}

@inproceedings{Bogdan_ICASSP,
  title     = {You are fired! Nonverbal role analysis in competitive meetings},
  author    = {Raducanu, Bogdan and Vitria, Jordi and Gatica-Perez, Daniel},
  booktitle = {Proc. IEEE Int. Conf. on Acoustics, Speech, and Signal Processing (ICASSP), Taiwan.},
  year      = {2009},
  month     = apr,
  keywords  = {Report_IX, IM2.IP3, Group Bourlard, inproceedings},
  abstract  = {This paper addresses the problem of social interaction analysis in competitive meetings, using nonverbal cues. For our study, we made use of ``The Apprentice'' reality TV show, which features a competition for a real, highly paid corporate job. Our analysis is centered around two tasks regarding a person's role in a meeting: predicting the person with the highest status and predicting the fired candidates. The current study was carried out using nonverbal audio cues. Results obtained from the analysis of a full season of the show, representing around 90 minutes of audio data, are very promising (up to 85.7% of accuracy in the first case and up to 92.8% in the second case). Our approach is based only on the nonverbal interaction dynamics during the meeting without relying on the spoken words.},
  projects  = {Idiap}
}

@article{Jayagopi_IEEETRANS.ONMULTIMEDIA_2010,
  title    = {Mining group nonverbal conversational patterns using probabilistic topic models},
  author   = {Jayagopi, D.  and Gatica-Perez, D. },
  journal  = {IEEE Transactions on Multimedia},
  year     = {2010},
  keywords = {Report_IX, IM2.IP3, Group Bourlard, article},
  abstract = {The automatic discovery of group conversational behavior is a relevant problem in social computing. In this paper, we present an approach to address this problem by defining a novel group descriptor called bag of group-nonverbal-patterns defined on brief observations of group interaction, and by using principled probabilistic topic models to discover topics. The proposed bag of group NVPs allows fusion of individual cues and facilitates the eventual comparison of groups of varying sizes. The use of topic models helps to cluster group interactions and to quantify how different they are from each other in a formal probabilistic sense. Results of behavioral topics discovered on the Augmented Multi-Party Interaction (AMI) meeting corpus are shown to be meaningful using human annotation with multiple observers. Our method facilitates ``group behaviour-based'' retrieval of group conversational segments without the need of any previous labeling.},
  projects = {Idiap, IM2}
}

@article{EPFL-ARTICLE-149422,
  title       = {An {I}nteractive {T}able for {S}upporting {P}articipation {B}alance in {F}ace-to-{F}ace {C}ollaborative {L}earning},
  author      = {Bachour, Khaled and Kaplan, Fr{\'e}d{\'e}ric and Dillenbourg, Pierre},
  journal     = {{IEEE} {T}ransactions on {L}earning {T}echnologies},
  year        = {2010},
  publisher   = {Institute of Electrical and Electronics Engineers},
  issn        = {1939-1382},
  keywords    = {Computer-Supported Collaborative Learning; Interactive Furniture; Ubiquitous Computing; Human-Computer Interaction, Report_IX, IM2.IP2, Group Dillenbourg, article},
  abstract    = {We describe an interactive table designed for supporting face-to-face collaborative learning. The table, Reflect, addresses the issue of unbalanced participation during group discussions. By displaying on its surface a shared visualization of member participation, Reflect is meant to encourage participants to avoid the extremes of over- and under-participation. We report on a user study that validates some of our hypotheses on the effect the table would have on its users. Namely we show that Reflect leads to more balanced collaboration, but only under certain conditions. We also show different effects the table has on over- and under-participators.},
  affiliation = {EPFL},
  details     = {http://infoscience.epfl.ch/record/149422},
  documenturl = {http://infoscience.epfl.ch/record/149422/files/bachour-reflect-TLT.pdf},
  oai-id      = {oai:infoscience.epfl.ch:149422},
  oai-set     = {article; fulltext-public; TEL; fulltext},
  review      = {REVIEWED},
  status      = {ACCEPTED},
  submitter   = {168210},
  unit        = {CRAFT}
}

@article{EPFL-ARTICLE-149271,
  title       = {Roombots: {R}econfigurable {R}obots for {A}daptive {F}urniture},
  author      = {Sproewitz, Alexander and Pouya, Soha and Bonardi, St{\'e}phane and van den Kieboom, Jesse and Moeckel, Rico and Billard, A.  and Dillenbourg, Pierre and Ijspeert, Auke},
  journal     = {{IEEE} {C}omputational {I}ntelligence {M}agazine},
  note        = {Special issue on ``Evolutionary and developmental approaches to robotics''},
  year        = {2010},
  keywords    = {self-reconfiguring modular robots; reconfiguration ; adaptive furniture, Report_IX, IM2.IP2, Group Dillenbourg, article},
  abstract    = {Imagine a world in which our furniture moves around like legged robots, interacts with us, and changes shape and function during the day according to our needs. This is the long term vision we have in the Roombots project. To work towards this dream, we are developing modular robotic modules that have rotational degrees of freedom for locomotion as well as active connection mechanisms for runtime reconfiguration. A piece of furniture, e.g. a stool, will thus be composed of several modules that activate their rotational joints together to implement locomotor gaits, and will be able to change shape, e.g. transforming into a chair, by sequences of attachments and detachments of modules. In this article, we firstly present the project and the hardware we are currently developing. We explore how reconfiguration from a configuration A to a configuration B can be controlled in a distributed fashion. This is done using metamodules-two Roombots modules connected serially-that use broadcast signals and connections to a structured ground to collectively build desired structures without the need of a centralized planner. We then present how locomotion controllers can be implemented in a distributed system of coupled oscillators-one per degree of freedom-similarly to the concept of central pattern generators (CPGs) found in the spinal cord of vertebrate animals. The CPGs are based on coupled phase oscillators to ensure synchronized behavior and have different output filters to allow switching between oscillations and rotations. A stochastic optimization algorithm is used to explore optimal CPG configurations for different simulated Roombots structures.},
  affiliation = {EPFL},
  details     = {http://infoscience.epfl.ch/record/149271},
  oai-id      = {oai:infoscience.epfl.ch:149271},
  oai-set     = {article; TEL},
  review      = {REVIEWED},
  status      = {ACCEPTED},
  submitter   = {173229; 173229},
  unit        = {BIOROB}
}

@incollection{EPFL-CHAPTER-149123,
  title       = {Technology for {C}lassroom {O}rchestration},
  author      = {Dillenbourg, Pierre and Jermann, Patrick},
  editor      = {Khine, M. S.  and Saleh, I. M. },
  booktitle   = {New {S}cience of {L}earning},
  year        = {2010},
  pages       = {525--552},
  publisher   = {Springer Science Business Media},
  doi         = {10.1007/978-1-4419-5716-0_26},
  keywords    = {Educational Technology; Classroom; Orchestration; Ecosystem, Report_IX, IM2.IP2, Group Dillenbourg, incollection},
  abstract    = {We use different criteria to judge teaching methods and learning environments as researchers and teachers. As researchers, we tend to rely on learning gains measured in controlled conditions. As teacher, the skilled management of classroom constraints results in the impression that a specific design ``works well''. We describe fourteen design factors related to the metaphors of classroom orchestration and education ecosystems and illustrate their embodiment in three learning environments. These design factors provide a teacher-centric, integrated view of educational technologies in the classroom. We expand this list of factors to include the main constraints that designers should consider to address the difficult methodological issue of generalizing research results about the effectiveness of methods and designs.},
  affiliation = {EPFL},
  details     = {http://infoscience.epfl.ch/record/149123},
  oai-id      = {oai:infoscience.epfl.ch:149123},
  oai-set     = {chapter; TEL},
  status      = {PUBLISHED},
  submitter   = {157873},
  unit        = {CRAFT}
}

@inproceedings{BIOROB-CONF-2009-002,
  title       = {Roombots-{M}echanical {D}esign of {S}elf-{R}econfiguring {M}odular {R}obots for {A}daptive {F}urniture},
  author      = {Sproewitz, Alexander and Billard, A.  and Dillenbourg, Pierre and Ijspeert, Auke Jan},
  booktitle   = {Proceedings of 2009 {IEEE} {I}nternational {C}onference on {R}obotics and {A}utomation},
  year        = {2009},
  pages       = {4259--4264},
  location    = {Kobe, Japan},
  doi         = {10.1109/robot.2009.5152613},
  keywords    = {self reconfiguring modular robots; active connection mechanism; furniture; mechanical design; quadruped robotics; biorob_roombots, Report_IX, IM2.IP2, Group Dillenbourg, inproceedings},
  abstract    = {We aim at merging technologies from information technology, roomware, and robotics in order to design adaptive and intelligent furniture. This paper presents design principles for our modular robots, called Roombots, as future building blocks for furniture that moves and self-reconfigures. The reconfiguration is done using dynamic connection and disconnection of modules and rotations of the degrees of freedom. We are furthermore interested in applying Roombots towards adaptive behaviour, such as online learning of locomotion patterns. To create coordinated and efficient gait patterns, we use a Central Pattern Generator (CPG) approach, which can easily be optimized by any gradient-free optimization algorithm. To provide a hardware framework we present the mechanical design of the Roombots modules and an active connection mechanism based on physical latches. Further we discuss the application of our Roombots modules as pieces of a homogenic or heterogenic mix of building blocks for static structures.},
  affiliation = {EPFL},
  details     = {http://infoscience.epfl.ch/record/142734},
  documenturl = {http://infoscience.epfl.ch/record/142734/files/ICRA2009_sproewit_preprint.pdf},
  oai-id      = {oai:infoscience.epfl.ch:142734},
  oai-set     = {conf; fulltext; fulltext-public; TEL},
  review      = {REVIEWED},
  status      = {PUBLISHED},
  unit        = {BIOROB LASA}
}

@inproceedings{CRAFT-CONF-2009-020,
  title       = {Using {A}ugmentations as {B}ridges from {C}oncrete to {A}bstract {R}epresentations},
  author      = {Zufferey, Guillaume and Jermann, Patrick and Do Lenh, Son and Dillenbourg, Pierre},
  booktitle   = {Proceedings of the 23rd {B}ritish {HCI} {G}roup {A}nnual {C}onference on {HCI} 2009: {C}elebrating {P}eople and {T}echnology},
  year        = {2009},
  pages       = {130--139},
  publisher   = {British Computer Society},
  location    = {Cambridge (UK)},
  url         = {http://www.hci2009.org},
  keywords    = {Tangible User Interfaces; Paper-based Interaction; Multiple External Representations; Augmented Reality; Vocational Training, Report_IX, IM2.IP2, Group Dillenbourg, inproceedings},
  abstract    = {We describe a pedagogical approach supporting the acquisition of abstraction skills by apprentices in logistics. Apprentices start with a concrete representation in the form of a small-scale model which aims at engaging them in learning activities. Multiple External Representations are used to progressively introduce more abstract representations displayed on paper-based forms called TinkerSheets. We present the implementation of this approach on the TinkerTable, a tabletop learning environment which is used in two professional schools by four different teachers. We report observations of the use of the environment at different stages of the curriculum with first- and second-year apprentices.},
  affiliation = {EPFL},
  details     = {http://infoscience.epfl.ch/record/140681},
  documenturl = {http://infoscience.epfl.ch/getfile.py?recid=140681&mode=best},
  oai-id      = {oai:infoscience.epfl.ch:140681},
  oai-set     = {conf; fulltext-public; fulltext; TEL},
  review      = {REVIEWED},
  status      = {PUBLISHED},
  unit        = {CRAFT}
}

@inproceedings{CRAFT-CONF-2009-019,
  title       = {Collaboration and abstract representations: towards predictive models based on raw speech and eye-tracking data},
  author      = {Nuessli, Marc-Antoine and Jermann, Patrick and Sangin, Mirweis and Dillenbourg, Pierre},
  booktitle   = {{CSCL} '09: {P}roceedings of the 2009 conference on {C}omputer support for collaborative learning},
  year        = {2009},
  publisher   = {International Society of the Learning Sciences},
  location    = {Rhodes},
  note        = {Invited Paper},
  url         = {http://www.isls.org/cscl2009/},
  keywords    = {Report_IX, IM2.IP2, Group Dillenbourg, inproceedings},
  abstract    = {This study aims to explore the possibility of using machine learning techniques to build predictive models of performance in collaborative induction tasks. More specifically, we explored how signal-level data, like eye-gaze data and raw speech may be used to build such models. The results show that such low level features have effectively some potential to predict performance in such tasks. Implications for future applications design are shortly discussed.},
  affiliation = {EPFL},
  details     = {http://infoscience.epfl.ch/record/140432},
  documenturl = {http://infoscience.epfl.ch/getfile.py?recid=140432&mode=best},
  oai-id      = {oai:infoscience.epfl.ch:140432},
  oai-set     = {conf; fulltext-public; fulltext; TEL},
  review      = {REVIEWED},
  status      = {PUBLISHED},
  unit        = {CRAFT}
}

@incollection{CRAFT-CHAPTER-2009-001,
  title       = {Interpersonal {C}omputers for {H}igher {E}ducation},
  author      = {Kaplan, F. and Do-Lenh, S. and Bachour, K. and Kao, G. Y. and Gault, C. and Dillenbourg, P.},
  editor      = {Dillenbourg, P. and Huang, J. and Cherubini, M.},
  booktitle   = {Interactive {A}rtifacts and {F}urniture {S}upporting {C}ollaborative {W}ork and {L}earning},
  series      = {Computer-Supported Collaborative Learning Series},
  year        = {2009},
  pages       = {129--145},
  publisher   = {Springer US},
  keywords    = {Report_IX, IM2.IP2, Group Dillenbourg, incollection},
  affiliation = {EPFL},
  details     = {http://infoscience.epfl.ch/record/131124},
  oai-id      = {oai:infoscience.epfl.ch:131124},
  oai-set     = {chapter; TEL},
  status      = {PUBLISHED},
  unit        = {CRAFT}
}

@article{vajda10robust,
  title    = {Robust duplicate detection of {2D} and {3D} objects},
  author   = {Vajda, P.  and Ivanov, I.  and Goldmann, L.  and Lee, J. -S.  and Ebrahimi, T. },
  journal  = {International Journal of Multimedia Data Engineering and Management},
  year     = {2010},
  keywords = {Report_IX, IM2.IP1, Group Ebrahimi, article}
}

@article{ivanov10geotag,
  title    = {Geotag propagation in social networks based on user trust model},
  author   = {Ivanov, I.  and Vajda, P.  and Lee, J. -S.  and Goldmann, L.  and Ebrahimi, T. },
  journal  = {Multimedia Tools and Applications},
  year     = {2010},
  keywords = {Report_IX, IM2.IP1, Group Ebrahimi, article}
}

@inproceedings{buchiner10gesture,
  title     = {Gesture and Touch Controlled Video Player Interface for Mobile Devices},
  author    = {Buchinger, S.  and De Simone, F.  and Hotop, E.  and Hlavacs, H.  and Ebrahimi, T. },
  booktitle = {Proceedings of the ACM Multimedia International Conference},
  year      = {2010},
  keywords  = {Report_IX, IM2.IP1, Group Ebrahimi, inproceedings}
}

@inproceedings{lee10subjective,
title = {Subjective evaluation of scalable video coding for content distribution},
author = {Lee, J. -S.  and Simone, F. De and Ramzan, N.  and Zhao, Z.  and Kurutepe, E.  and Sikora, T.  and Ostermann, J.  and Izquierdo, E.  and Ebrahimi, T. },
booktitle = {Proceedings of the ACM Multimedia International Conference},
year = {2010},
keywords = {Report_IX, IM2.IP1, Group Ebrahimi, inproceedings}
}

@inproceedings{koelstra10single,
title = {Single trial classification of {EEG} and peripheral physiological signals for recognition of emotions induced by music videos},
author = {Koelstra, S.  and Yazdani, A.  and Soleymani, M.  and Muehl, C.  and Lee, J. -S.  and Nijholt, A.  and Pun, T.  and Ebrahimi, T.  and Patras, I. },
booktitle = {Proceedings of the International Conference on Brain Informatics},
year = {2010},
keywords = {Report_IX, IM2.IP1, Group Ebrahimi, inproceedings}
}

@inproceedings{ivanov10epitome,
title = {Epitome -- a social game for photo album summarization},
author = {Ivanov, I.  and Vajda, P.  and Lee, J. -S.  and Ebrahimi, T. },
booktitle = {Proceedings of the International Workshop on Connected Multimedia},
year = {2010},
keywords = {Report_IX, IM2.IP1, Group Ebrahimi, inproceedings}
}

@inproceedings{vajda10propagation,
title = {Propagation of geotags based on object duplicate detection},
author = {Vajda, P.  and Ivanov, I.  and Lee, J. -S.  and Goldmann, L.  and Ebrahimi, T. },
booktitle = {Proceedings of {SPIE}},
year = {2010},
volume = {7798},
keywords = {Report_IX, IM2.IP1, Group Ebrahimi, inproceedings}
}

@inproceedings{desimone10subjective,
title = {Subjective evaluation of next-generation video compression algorithm: a case study},
author = {Simone, F. De and Goldmann, L.  and Lee, J. -S.  and Ebrahimi, T.  and Baroncini, V. },
booktitle = {Proceedings of {SPIE}},
year = {2010},
volume = {7798},
keywords = {Report_IX, IM2.IP1, Group Ebrahimi, inproceedings}
}

@inproceedings{vajda103d,
title = {3{D} object duplicate detection for video retrieval},
author = {Vajda, P.  and Ivanov, I.  and Goldmann, L.  and Lee, J. -S.  and Ebrahimi, T. },
booktitle = {Proceedings of the International Workshop on Image Analysis for Multimedia Interactive Services},
year = {2010},
address = {Desenzano del Garda, Italy},
keywords = {Report_IX, IM2.IP1, Group Ebrahimi, inproceedings}
}

@inproceedings{ivanov10object,
title = {Object-based tag propagation for semi-automatic annotation of images},
author = {Ivanov, I.  and Vajda, P.  and Goldmann, L.  and Lee, J. -S.  and Ebrahimi, T. },
booktitle = {Proceedings of the ACM SIGMM International Conference on Multimedia Information Retrieval},
year = {2010},
pages = {497--506},
keywords = {Report_IX, IM2.IP1, Group Ebrahimi, inproceedings}
}

@inproceedings{desimone10h264,
title = {A {H.264/AVC} video database for the evaluation of quality metrics},
author = {Simone, F. De and Tagliasacchi, M.  and Naccari, M.  and Tubaro, S.  and Ebrahimi, T. },
booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing},
year = {2010},
pages = {2430--2433},
keywords = {Report_IX, IM2.IP1, Group Ebrahimi, inproceedings}
}

@inproceedings{desimone10on,
title = {On the limits of perceptually optimized {JPEG}},
author = {Simone, F. De and Goldmann, L.  and Filimonov, D.  and Ebrahimi, T. },
booktitle = {Proceedings of International Workshop on Video Processing and Quality Metrics for Consumer Electronics},
year = {2010},
keywords = {Report_IX, IM2.IP1, Group Ebrahimi, inproceedings}
}

@inproceedings{goldmann10impact,
title = {Impact of acquisition distortion on the quality of stereoscopic images},
author = {Goldmann, L.  and Simone, F. De and Ebrahimi, T. },
booktitle = {Proceedings of International Workshop on Video Processing and Quality Metrics for Consumer Electronics},
year = {2010},
keywords = {Report_IX, IM2.IP1, Group Ebrahimi, inproceedings}
}

@inproceedings{goldmann10comprehensive,
title = {A Comprehensive Database and Subjective Evaluation Methodology for Quality of Experience in Stereoscopic Video},
author = {Goldmann, L.  and Simone, F. De and Ebrahimi, T. },
booktitle = {Proceedings of SPIE},
year = {2010},
volume = {7526},
address = {San Jose, California, USA},
keywords = {Report_IX, IM2.IP1, Group Ebrahimi, inproceedings}
}

@inproceedings{vajda09analysis,
title = {Analysis of the limits of graph-based object duplicate detection},
author = {Vajda, P.  and Goldmann, L.  and Ebrahimi, T. },
booktitle = {Proceedings of the IEEE International Symposium on Multimedia},
year = {2009},
pages = {600--605},
address = {San Diego, California, USA},
keywords = {Report_IX, IM2.IP1, Group Ebrahimi, inproceedings}
}

@inproceedings{yazdani09implicit,
title = {Implicit emotional tagging of multimedia using {EEG} signals and brain computer interface},
author = {Yazdani, A.  and Lee, J. -S.  and Ebrahimi, T. },
booktitle = {Proceedings of the International Workshop on Social Media},
year = {2009},
pages = {81--88},
keywords = {Report_IX, IM2.IP1, Group Ebrahimi, inproceedings}
}

@inproceedings{lee09efficient,
title = {Efficient video coding in {H.264/AVC} by using audio-visual information},
author = {Lee, J. -S.  and Ebrahimi, T. },
booktitle = {Proceedings of the IEEE International Workshop on Multimedia Signal Processing},
year = {2009},
address = {Rio de Janeiro, Brazil},
keywords = {Report_IX, IM2.IP1, Group Ebrahimi, inproceedings}
}

@inproceedings{desimone09subjective,
title = {Subjective evaluation of {JPEG XR} image compression},
author = {Simone, F. De and Goldmann, L.  and Baroncini, V.  and Ebrahimi, T. },
booktitle = {Proceedings of SPIE},
year = {2009},
volume = {7443},
address = {San Diego, California, USA},
keywords = {Report_IX, IM2.IP1, Group Ebrahimi, inproceedings}
}

@inproceedings{IlyaBoyandin201005,
title = {Using Flow Maps to Explore Migrations Over Time},
author = {Boyandin, Ilya and Bertini, E.  and Lalanne, D. },
booktitle = {Proceedings of Geospatial Visual Analytics Workshop in conjunction with The 13th AGILE International Conference on Geographic Information Science},
year = {2010},
keywords = {Report_IX, IM2.IP1, Group Ingold, inproceedings},
details = {IM2.HMI}
}

@inproceedings{FlorianEvequoz20109,
title = {La navigation par facettes appliquée à la gestion de l'information personnelle},
author = {Ev\'equoz, F.  and Thomet, Julien and Lalanne, D. },
booktitle = {Proceedings of 22ème Conférence Francophone sur l'Interaction Homme-Machine (IHM'10)},
year = {2010},
keywords = {Report_IX, IM2.IP1, Group Ingold, inproceedings},
details = {IM2.HMI}
}

@inproceedings{DalilaMekhaldi200912,
title = {Joining Meeting Documents to Strengthen Multimodal Thematic Alignment},
author = {Mekhaldi, Dalila and Lalanne, D. },
booktitle = {Proceedings of 5th International Conference on Signal Image Technology and Internet Based Systems (SITIS 2009)},
year = {2009},
pages = {88--96},
keywords = {Report_IX, IM2.IP1, Group Ingold, inproceedings},
details = {IM2.DMA}
}

@inproceedings{PascalBruegger200912,
title = {A Method and Tools for Designing and Prototyping Activity-based Pervasive Applications},
author = {Bruegger, Pascal and Lalanne, D.  and Lisowska, A.  and Hirsbrunner, B. },
booktitle = {Proceedings of 7th International Conference on Advances in Mobile Computing \& Multimedia (ACM MoMM 2009)},
year = {2009},
pages = {129--136},
keywords = {Report_IX, IM2.IP1, Group Ingold, inproceedings},
details = {IM2.HMI}
}

@article{EnricoBertini200912,
title = {Investigating and reflecting on the integration of automatic data analysis and visualization in knowledge discovery},
author = {Bertini, E.  and Lalanne, D. },
journal = {ACM SIGKDD Explorations},
year = {2009},
volume = {22},
keywords = {Report_IX, IM2.IP1, Group Ingold, article},
details = {IM2.HMI}
}

@inproceedings{FlorianEvequoz200911,
title = {"I Thought You Would Show Me How To Do It" -- Studying and Supporting PIM Strategy Changes},
author = {Ev\'equoz, F.  and Lalanne, D. },
booktitle = {Proceedings of ASIS\&T PIM Workshop (ASIS\&T 2009)},
year = {2009},
keywords = {Report_IX, IM2.IP1, Group Ingold, inproceedings},
details = {IM2.HMI}
}

@techreport{FlorianEvequoz20094,
title = {An Ethnographically-Inspired Survey of PIM Strategies. Technical Report},
author = {Ev\'equoz, F. },
year = {2009},
institution = {Department of Informatics, University of Fribourg, Switzerland},
keywords = {Report_IX, IM2.IP1, Group Ingold, techreport},
details = {IM2.HMI}
}

@article{BrunoDumas200911a,
title = {Description Languages for Multimodal Interaction: a Set of Guidelines},
author = {Dumas, B.  and Lalanne, D.  and Ingold, R. },
journal = {Journal on Multimodal User Interfaces},
year = {2009},
volume = {3},
keywords = {Report_IX, IM2.IP1, Group Ingold, article},
details = {IM2.IP2}
}

@inproceedings{BrunoDumas200911b,
title = {HephaisTK: A Toolkit for Rapid Prototyping of Multimodal Interfaces},
author = {Dumas, B.  and Lalanne, D.  and Ingold, R. },
booktitle = {Proceedings of International Conference on Multimodal Interfaces and Workshop on Machine Learning for Multi-modal Interaction (ICMI-MLMI 2009)},
year = {2009},
pages = {231--232},
keywords = {Report_IX, IM2.IP1, Group Ingold, inproceedings},
details = {IM2.HMI}
}

@book{Chen2010,
title = {Proceedings of the ACM-SIGIR 2010 conference},
author = {Chen, Hsin-Hsi and Efthimiadis, {Efthimis N.} and Savoy, Jacques and Crestani, Fabio and Marchand-Maillet, S. },
year = {2010},
publisher = {ACM Digital Library},
keywords = {Report_IX, IM2.IP1, Group Pun, book}
}

@incollection{Marchand-Maillet2010,
title = {Interactive Representations of Multimodal Databases},
author = {Marchand-Maillet, S.  and Morrison, D.  and Szekely, E.  and Bruno, E. },
editor = {Bourlard, H.  and Marques, F.  and Thiran, J. -Ph. },
booktitle = {Multimodal Signal Processing for Human Computer Interaction},
year = {2010},
keywords = {Report_IX, IM2.IP1, Group Pun, incollection}
}

@incollection{Marchand-Maillet2010a,
title = {Capturing the semantics of user interaction: {A} review and case study},
author = {Morrison, D.  and Bruno, E.  and Marchand-Maillet, S. },
editor = {Chbeir, R.  and Badr, Y.  and Abraham, A.  and Hassanien, A. -E. },
booktitle = {Emergent Web Intelligence: {A}dvanced Information Retrieval},
year = {2010},
publisher = {Springer},
keywords = {Report_IX, IM2.IP1, Group Pun, incollection}
}

@book{Kompatsiaris2010,
title = {Image and Video Retrieval: {T}heory and Applications},
author = {Kompatsiaris, I.  and Marchand-Maillet, S.  and Marcel, S.  and {van Zwol}, R. },
booktitle = {Multimedia Tools and Applications},
year = {2010},
publisher = {Springer},
keywords = {Report_IX, IM2.IP1, Group Pun, book}
}

@incollection{Kierkels2010,
title = {Identification of narrative peaks in clips: text features perform best},
author = {Kierkels, J.  and Soleymani, M.  and Pun, T. },
booktitle = {VideoCLEF 2009, Cross Language Evaluation Forum (CLEF) Workshop, Post-Conference Proceedings},
year = {2010},
publisher = {Springer LNCS},
keywords = {Report_IX, IM2.IP1, Group Pun, incollection}
}

@incollection{Deville2010,
title = {See Color: {S}eeing colours with an orchestra},
author = {Deville, B.  and Bologna, G.  and Vinckenbosch, M.  and Pun, T. },
editor = {Lalanne, D.  and Kohlas, J. },
booktitle = {Human Machine Interaction, Research Results of the MMI Program},
year = {2009},
pages = {251--279},
publisher = {Springer LNCS},
keywords = {Report_IX, IM2.IP1, Group Pun, incollection}
}

@inproceedings{Morrison2010,
author = {Morrison, D.  and Bruno, E.  and Marchand-Maillet, S. },
booktitle = {ACM Multimedia 2010},
year = {2010},
keywords = {Report_IX, IM2.IP1, Group Pun, inproceedings}
}

@inproceedings{Koelstra2010,
title = {Single trial classification of EEG and peripheral physiological signals for recognition of emotions induced by music videos},
author = {Koelstra, S.  and Yazdani, A.  and Soleymani, M.  and Muehl, C.  and Lee, J. -S.  and Nijholt, A.  and Pun, T.  and Ebrahimi, T.  and Patras, I. },
booktitle = {Brain Informatics},
year = {2010},
keywords = {Report_IX, IM2.IP1, Group Pun, inproceedings}
}

@inproceedings{Kierkels2009,
title = {Identification of narrative peaks in clips: text features perform best},
author = {Kierkels, J.  and Soleymani, M.  and Pun, T. },
booktitle = {VideoCLEF 2009, Cross Language Evaluation Forum (CLEF) Workshop, ECDL 2009},
year = {2009},
keywords = {Report_IX, IM2.IP1, Group Pun, inproceedings}
}

@inproceedings{SOleymani2009,
title = {A collaborative personalized affective video retrieval system},
author = {Soleymani, M.  and Davis, J.  and Pun, T. },
booktitle = {International Conference on Affective Computing and Intelligent Interaction},
year = {2009},
pages = {588--589},
keywords = {Report_IX, IM2.IP1, Group Pun, inproceedings}
}

@inproceedings{Soleymani2009_2,
title = {A Bayesian framework for video affective representation},
author = {Soleymani, M.  and Kierkels, J.  and Chanel, G.  and Pun, T. },
booktitle = {International Conference on Affective Computing and Intelligent Interaction},
year = {2009},
pages = {267--273},
keywords = {Report_IX, IM2.IP1, Group Pun, inproceedings}
}

@inproceedings{Kierkels2009_2,
title = {Simultaneous exploitation of explicit and implicit tags in affect-based multimedia retrieval},
author = {Kierkels, J.  and Pun, T. },
booktitle = {International Conference on Affective Computing and Intelligent Interaction},
year = {2009},
pages = {274--279},
keywords = {Report_IX, IM2.IP1, Group Pun, inproceedings}
}

@inproceedings{Kierkels2009_3,
title = {Queries and tags in affect-based multimedia retrieval},
author = {Kierkels, J.  and Soleymani, M.  and Pun, T. },
booktitle = {International Conference on Multimedia and Expo, Special Session on Implicit Tagging},
year = {2009},
keywords = {Report_IX, IM2.IP1, Group Pun, inproceedings}
}

@article{LTS-ARTICLE-2008-065a,
title = {Cooperative {O}bject {S}egmentation and {B}ehavior {I}nference in {I}mage {S}equences},
author = {Gui, L.  and Thiran, J. -Ph.  and Paragios, N. },
journal = {International {J}ournal of {C}omputer {V}ision},
year = {2009},
volume = {84},
number = {2},
pages = {146--162},
keywords = {image segmentation; behavior inference; gesture recognition; LTS5, Report_IX, IM2.IP1, Group Thiran, article},
abstract = {In this paper, we propose a general framework for fusing bottom-up segmentation with top-down object behavior inference over an image sequence. This approach is beneficial for both tasks, since it enables them to cooperate so that knowledge relevant to each can aid in the resolution of the other, thus enhancing the final result. In particular, the behavior inference process offers dynamic probabilistic priors to guide segmentation. At the same time, segmentation supplies its results to the inference process, ensuring that they are consistent both with prior knowledge and with new image information. The prior models are learned from training data and they adapt dynamically, based on newly analyzed images. We demonstrate the effectiveness of our framework via particular implementations that we have employed in the resolution of two hand gesture recognition applications. Our experimental results illustrate the robustness of our joint approach to segmentation and behavior inference in challenging conditions involving complex backgrounds and occlusions of the target object.},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/record/125035},
documenturl = {http://infoscience.epfl.ch/getfile.py?recid=125035&mode=best},
extra-id = {000266477100003},
oai-id = {oai:infoscience.epfl.ch:125035},
oai-set = {article; fulltext-public; fulltext},
review = {REVIEWED},
status = {PUBLISHED},
unit = {LTS}
}

@article{EPFL-ARTICLE-148644,
title = {Modelling human perception of static facial expressions},
author = {Sorci, M.  and Antonini, G.  and Cruz Mota, J.  and Rubin, T.  and Bierlaire, M.  and Thiran, J. -Ph. },
journal = {Image and {V}ision {C}omputing},
year = {2010},
volume = {28},
number = {5},
pages = {790--806},
publisher = {Elsevier},
issn = {0262-8856},
doi = {10.1016/j.imavis.2009.10.003},
keywords = {Face; Facial expression; LTS5, Report_IX, IM2.IP1, Group Thiran, article},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/record/148644},
extra-id = {000275849900008},
oai-id = {oai:infoscience.epfl.ch:148644},
oai-set = {article},
review = {REVIEWED},
status = {PUBLISHED},
submitter = {115534},
unit = {LTS}
}

@inproceedings{LTS-CONF-2009-058,
title = {Selecting Relevant Visual Features for Speechreading},
author = {Estellers, Virginia and Gurban, M.  and Thiran, J. -Ph. },
booktitle = {Proc. {o}f the {IEEE} {I}nternational {C}onference on {I}mage {P}rocessing},
year = {2009},
location = {Cairo},
url = {http://www.icip2009.org/},
keywords = {LTS5; Feature extraction; image processing; speech recognition, Report_IX, IM2.IP1, Group Thiran, inproceedings},
abstract = {A quantitative measure of relevance is proposed for the task of constructing visual feature sets which are at the same time relevant and compact. A feature's relevance is given by the amount of information that it contains about the problem, while compactness is achieved by preventing the replication of information between features in the set. To achieve these goals, we use mutual information both for assessing relevance and measuring the redundancy between features. Our application is speechreading, that is, speech recognition performed on the video of the speaker. This is justified by the fact that the performance of audio speech recognition can be improved by augmenting the audio features with visual ones, especially when there is noise in the audio channel. We report significant improvements compared to the most commonly used method of dimensionality reduction for speechreading, linear discriminant analysis.},
affiliation = {EPFL},
details = {http://infoscience.epfl.ch/record/138674},
documenturl = {http://infoscience.epfl.ch/getfile.py?recid=138674&mode=best},
oai-id = {oai:infoscience.epfl.ch:138674},
oai-set = {conf},
review = {REVIEWED},
status = {PUBLISHED},
unit = {LTS}
}

@article{VanGool001,
title = {Robust Multi-Person Tracking from a Mobile Platform},
author = {Ess, A.  and Leibe, B.  and Schindler, K.  and Gool, L. Van},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
year = {2009},
volume = {31},
number = {10},
pages = {1831--1846},
keywords = {Report_IX, IM2.IP3, Group VanGool, article}
}

@article{VanGool002,
title = {Shape-from-Recognition: Recognition enables Meta-data Transfer},
author = {Thomas, A.  and Ferrari, V.  and Leibe, B.  and Tuytelaars, T.  and Gool, L. Van},
journal = {Computer Vision and Image Understanding},
year = {2009},
volume = {113},
number = {12},
pages = {1222--1234},
keywords = {Report_IX, IM2.IP3, Group VanGool, article}
}

@article{VanGool003,
title = {Using Multi-view Recognition to Guide a Robot},
author = {Thomas, A.  and Ferrari, V.  and Leibe, B.  and Tuytelaars, T.  and Gool, L. Van},
journal = {International Journal of Robotics Research},
year = {2009},
volume = {28},
number = {8},
pages = {976--998},
keywords = {Report_IX, IM2.IP3, Group VanGool, article}
}

@article{VanGool004,
title = {Multi-object tracking evaluated on sparse events},
author = {Roth, D.  and Koller-Meier, E.  and Gool, Luc Van},
journal = {Multimedia Tools and Applications},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, article}
}

@article{VanGool005,
title = {Automated Recognition of 3D CAD Objects in Site Laser Scans for Project 3D Status Visualization and Performance Control},
author = {Bosché, F.  and Haas, C. T.  and Akinci, B. },
journal = {ASCE Journal of Computing in Civil Engineering},
year = {2009},
volume = {23},
number = {6},
pages = {311--318},
keywords = {Report_IX, IM2.IP3, Group VanGool, article}
}

@article{VanGool006,
title = {Fast and Automatic Object Pose Estimation for Range Images on the GPU},
author = {Park, I. K.  and Germann, M.  and Breitenstein, M. D.  and Pfister, H. },
journal = {Machine Vision and Applications},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, article}
}

@article{VanGool007,
title = {Real-time Body Pose Recognition using 2D or 3D Haarlets},
author = {Van den Bergh, Michael and Koller-Meier, E. and Van Gool, Luc},
journal = {International Journal of Computer Vision},
year = {2009},
volume = {83},
pages = {72--84},
keywords = {Report_IX, IM2.IP3, Group VanGool, article}
}

@article{VanGool008,
title = {Procedural Modeling for Digital Cultural Heritage},
author = {Haegler, S.  and Müller, P.  and Gool, Luc Van},
journal = {EURASIP Journal on Image and Video Processing},
year = {2009},
volume = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, article}
}

@incollection{VanGool009,
title = {Real-time 3D Body Pose Estimation},
author = {Van den Bergh, Michael and Kehl, Roland and Koller-Meier, E. and Van Gool, Luc},
editor = {Aghajan, Hamid and Cavallaro, Andrea},
booktitle = {Multi-Camera Networks: Concepts and Applications},
year = {2009},
pages = {335--360},
publisher = {Elsevier},
keywords = {Report_IX, IM2.IP3, Group VanGool, incollection}
}

@incollection{VanGool010,
title = {An introduction to kernel learning algorithms},
author = {Gehler, Peter and Schölkopf, Bernhard},
editor = {Camps-Valls, Gustavo and Bruzzone, Lorenzo},
booktitle = {Kernel Methods for Remote Sensing Data Analysis},
year = {2009},
pages = {39--60},
publisher = {Wiley},
keywords = {Report_IX, IM2.IP3, Group VanGool, incollection}
}

@incollection{VanGool011,
title = {2D Human Pose Estimation in TV Shows},
author = {Ferrari, V. and Marin, M. and Zisserman, A.},
editor = {Cremers, D.  and Rosenhahn, B.  and Yuille, A.  and Schmidt, F. },
booktitle = {Statistical and Geometrical Approaches to Visual Motion Analysis},
year = {2009},
pages = {128--147},
publisher = {Springer},
keywords = {Report_IX, IM2.IP3, Group VanGool, incollection}
}

@inproceedings{VanGool012,
title = {A Hough Transform-Based Voting Framework for Action Recognition},
author = {Yao, A.  and Gall, J.  and Gool, L. Van},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition},
year = {2010},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool013,
title = {What's going on? Discovering Spatio-Temporal Dependencies in Dynamic Scenes},
author = {Kuettel, D.  and Breitenstein, M. D.  and Gool, Luc Van and Ferrari, V. },
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition},
year = {2010},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool014,
title = {Exploiting simple hierarchies for unsupervised human behavior analysis},
author = {Nater, Fabian and Grabner, Helmut and Gool, Luc Van},
booktitle = {CVPR},
year = {2010},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool015,
title = {Visual abnormal event detection for prolonged independent living},
author = {Nater, Fabian and Grabner, Helmut and Gool, Luc Van},
booktitle = {IEEE Healthcom Workshop on mHealth},
year = {2010},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool016,
title = {3D Vision Technology for Capturing Multimodal Corpora: Chances and Challenges},
author = {Fanelli, G.  and Gall, J.  and Romsdorfer, H.  and Weise, T.  and Gool, L. Van},
booktitle = {LREC Workshop on Multimodal Corpora},
year = {2010},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool017,
title = {2D Action Recognition Serves 3D Human Pose Estimation},
author = {Gall, J.  and Yao, A.  and Gool, L. Van},
booktitle = {European Conference on Computer Vision},
year = {2010},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool018,
title = {On-line Adaption of Class-specific Codebooks for Instance Tracking},
author = {Gall, J.  and Razavi, N.  and Gool, L. Van},
booktitle = {British Machine Vision Conference},
year = {2010},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool019,
title = {Evaluation of Agent Motion in Video: Online Tracking-by-Detection},
author = {Breitenstein, M. D.  and Leibe, B.  and Gool, Luc Van},
booktitle = {International Conference on Cognitive Systems},
year = {2010},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool020,
title = {Grammar-Based Encoding of Facades},
author = {Haegler, S.  and Wonka, P.  and Arisona, Stefan Mueller and Gool, Luc Van and Müller, P. },
booktitle = {EGSR},
year = {2010},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool021,
title = {Moving Obstacle Detection in Highly Dynamic Scenes},
author = {Ess, A.  and Leibe, B.  and Schindler, K.  and Gool, L. Van},
booktitle = {IEEE International Conference on Robotics and Automation},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool022,
title = {Improved Multi-Person Tracking with Active Occlusion Handling},
author = {Ess, A.  and Schindler, K.  and Leibe, B.  and van Gool, L. },
booktitle = {ICRA Workshop on People Detection and Tracking},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool023,
title = {Feature-Centric Efficient Subwindow Search},
author = {Lehmann, Alain and Leibe, B.  and Gool, Luc Van},
booktitle = {IEEE International Conference on Computer Vision},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool024,
title = {PRISM: PRincipled Implicit Shape Model},
author = {Lehmann, Alain and Leibe, B.  and Gool, Luc Van},
booktitle = {British Machine Vision Conference},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool025,
title = {Better Appearance Models for Pictorial Structures},
author = {Eichner, Marcin and Ferrari, V. },
booktitle = {British Machine Vision Conference},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool026,
title = {Tracker Trees for Unusual Event Detection},
author = {Nater, Fabian and Grabner, Helmut and Jaeggli, T.  and Gool, Luc Van},
booktitle = {IEEE International Workshop on Visual Surveillance},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool027,
title = {Hough Transform-based Mouth Localization for Audio-Visual Speech Recognition},
author = {Fanelli, G.  and Gall, J.  and Gool, L. Van},
booktitle = {British Machine Vision Conference},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool028,
title = {Evaluation of 3D City Models Using Automatic Placed Urban Agents},
author = {Aschwanden, Gideon and Haegler, S.  and Halatsch, Jan and Jecker, Raphael and Schmitt, Gerhard and Gool, Luc Van},
booktitle = {CONVR},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool029,
title = {Tracking a Hand Manipulating an Object},
author = {Hamer, Henning and Schindler, K.  and Koller-Meier, E.  and Gool, Luc Van},
booktitle = {IEEE International Conference on Computer Vision},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool030,
title = {Class-Specific Hough Forests for Object Detection},
author = {Gall, J.  and Lempitsky, V. },
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool031,
title = {Motion Capture Using Joint Skeleton Tracking and Surface Estimation},
author = {Gall, J.  and Stoll, C.  and de Aguiar, E.  and Theobalt, C.  and Rosenhahn, B.  and Seidel, H. -P. },
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool032,
title = {Unsupervised Face Alignment by Robust Nonrigid Mapping},
author = {Zhu, J.  and Gool, L. Van and Hoi, S. C. },
booktitle = {ICCV2009},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool033,
title = {Rome Reborn 2.0: A Framework for Virtual City Reconstruction Using Procedural Modeling Techniques},
author = {Dylla, K.  and Müller, P.  and Ulmer, A.  and Haegler, S.  and Fischer, B. },
booktitle = {Proceedings of Computer Applications and Quantitative Methods in Archaeology},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool034,
title = {Who's Doing What: Joint Modeling of Names and Verbs for Simultaneous Face and Pose Annotation},
author = {Jie, L.  and Caputo, Barbara and Ferrari, V. },
booktitle = {Advances in Neural Information Processing Systems},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool035,
title = {Distance Metric Learning from Uncertain Side Information with Application to Automated Photo Tagging},
author = {Wu, L.  and Hoi, S. C.  and Jin, R.  and Zhu, J.  and Yu, N. },
booktitle = {ACM Multimedia 2009},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool036,
title = {A Comparison of 3D Model-based Tracking Approaches for Human Motion Capture in Uncontrolled Environments},
author = {Shaheen, M.  and Gall, J.  and Strzodka, R.  and Gool, L. Van and Seidel, H. -P. },
booktitle = {IEEE Workshop on Applications of Computer Vision},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool037,
title = {Haarlet-based Hand Gesture Recognition for 3D Interaction},
author = {Van den Bergh, M. and Bosche, F. and Koller-Meier, E. and Van Gool, L.},
booktitle = {Proceedings of the IEEE Workshop on Applications of Computer Vision},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool038,
title = {Towards Collaborative Interaction with Large nD Models for Effective Project Management},
author = {Van den Bergh, M. and Halatsch, J. and Kunze, A. and Bosche, F. and Van Gool, L. and Schmitt, G.},
booktitle = {9th International Conference on Construction Applications of Virtual Reality},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool039,
title = {Robust Tracking-by-Detection using a Detector Confidence Particle Filter},
author = {Breitenstein, M. D.  and Reichlin, Fabian and Leibe, B.  and Koller-Meier, E.  and Gool, Luc Van},
booktitle = {IEEE International Conference on Computer Vision},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool040,
title = {Hunting Nessie -- Real-Time Abnormality Detection from Webcams},
author = {Breitenstein, M. D.  and Grabner, Helmut and Gool, Luc Van},
booktitle = {IEEE International Workshop on Visual Surveillance},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool041,
title = {A Distributed Camera System for Multi-Resolution Surveillance},
author = {Bellotto, N.  and Sommerlade, E.  and Benfold, B.  and Bibby, C.  and Reid, I.  and Roth, D.  and Gool, L. Van and Fernandez, C.  and Gonzalez, J. },
booktitle = {Third ACM/IEEE International Conference on Distributed Smart Cameras},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool042,
title = {Markerless Motion Capture with Unsynchronized Moving Cameras},
author = {Hasler, N. and Rosenhahn, B. and Thorm{\"a}hlen, T. and Wand, M. and Gall, J. and Seidel, H. -P.},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool043,
title = {Let the Kernel Figure it Out: Principled Learning of Pre-processing for Kernel Classifiers},
author = {Gehler, Peter and Nowozin, Sebastian},
booktitle = {Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool044,
title = {On Feature Combination for Multiclass Object Classification},
author = {Gehler, Peter and Nowozin, Sebastian},
booktitle = {Proceedings of the Twelfth IEEE International Conference on Computer Vision},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool045,
title = {I know what you did last summer: object-level auto-annotation of holiday snaps},
author = {Gammeter, S.  and Bossard, L.  and Quack, T.  and Gool, L. Van},
booktitle = {International Conference on Computer Vision},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool046,
title = {Beyond Semi-Supervised Tracking: Tracking Should Be as Simple as Detection, but not Simpler than Recognition},
author = {Stalder, S.  and Grabner, H.  and Gool, L. Van},
booktitle = {OLCV 09: 3rd On-line learning for Computer Vision Workshop},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool047,
title = {You'll Never Walk Alone: Modeling Social Behavior for Multi-target Tracking},
author = {Pellegrini, S.  and Ess, A.  and Schindler, K.  and van Gool, L. },
booktitle = {International Conference on Computer Vision},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool048,
title = {Deformation-aware Log-Linear Models},
author = {Gass, T.  and Deselaers, T.  and Ney, H. },
booktitle = {Deutsche Arbeitsgemeinschaft für Mustererkennung Symposium},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool049,
title = {In-hand Scanning with Online Loop Closure},
author = {Weise, T.  and Wismer, T.  and Leibe, B.  and Gool, L. Van},
booktitle = {IEEE International Workshop on 3-D Digital Imaging and Modeling},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@inproceedings{VanGool050,
title = {Log-Linear Mixtures for Object Recognition},
author = {Weyand, T.  and Deselaers, T.  and Ney, H. },
booktitle = {British Machine Vision Conference},
year = {2009},
keywords = {Report_IX, IM2.IP3, Group VanGool, inproceedings}
}

@article{vangool001a,
title = {Fast {PRISM}: Branch and Bound Hough Transform for Object Class Detection},
author = {Lehmann, A.  and Leibe, B.  and Gool, L. Van},
journal = {International Journal of Computer Vision},
year = {2011},
volume = {94},
number = {2},
pages = {175--197},
keywords = {report_X, IM2.IP3, Group van Gool, article}
}

@article{vangool002a,
title = {Empiric Design Evaluation in Urban Planning},
author = {Aschwanden, G.  and Haegler, S.  and Bosché, F.  and Gool, L. Van and Schmitt, G. },
journal = {Automation in Construction},
year = {2011},
volume = {20},
number = {3},
pages = {299--310},
keywords = {report_X, IM2.IP3, Group van Gool, article}
}

@article{vangool003a,
title = {A 3-D Audio-Visual Corpus of Affective Communication},
author = {Fanelli, G.  and Gall, J.  and Romsdorfer, H.  and Weise, T.  and Gool, L. Van},
journal = {IEEE Transactions on Multimedia},
year = {2010},
volume = {12},
number = {6},
pages = {591--598},
keywords = {report_X, IM2.IP3, Group van Gool, article}
}

@article{vangool004a,
title = {Online Multi-Person Tracking-by-Detection from a Single, Uncalibrated Camera},
author = {Breitenstein, M. D.  and Reichlin, F.  and Leibe, B.  and Koller-Meier, E.  and Gool, L. Van},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
year = {2010},
keywords = {report_X, IM2.IP3, Group van Gool, article}
}

@inproceedings{vangool005a,
title = {Real Time Head Pose Estimation with Random Regression Forest},
author = {Fanelli, G.  and Gall, J.  and Gool, L. Van},
booktitle = {Computer Vision and Pattern Recognition (CVPR)},
year = {2011},
keywords = {report_X, IM2.IP3, Group van Gool, inproceedings}
}

@inproceedings{vangool006a,
title = {Data-Driven Animation of Hand-Object Interactions},
author = {Hamer, H.  and Gall, J.  and Urtasun, R.  and Gool, L. Van},
booktitle = {IEEE Conference on Automatic Face and Gesture Recognition},
year = {2011},
keywords = {report_X, IM2.IP3, Group van Gool, inproceedings}
}

@inproceedings{vangool007a,
title = {Functional Categorization of Objects using Real-time Markerless Motion Capture},
author = {Gall, J.  and Fossati, A.  and Gool, L. Van},
booktitle = {Computer Vision and Pattern Recognition (CVPR)},
year = {2011},
keywords = {report_X, IM2.IP3, Group van Gool, inproceedings}
}

@inproceedings{vangool008a,
title = {Automatic Architectural Style Recognition},
author = {Mathias, M.  and Martinovic, A.  and Weissenberg, J.  and Haegler, S.  and Gool, L. Van},
booktitle = {3D-ARCH 2011: 3D Virtual Reconstruction and Visualization of Complex Architecture},
year = {2011},
keywords = {report_X, IM2.IP3, Group van Gool, inproceedings}
}

@inproceedings{vangool009a,
title = {Scalable Multi-class Object Detection},
author = {Razavi, N.  and Gall, J.  and Gool, L. Van},
booktitle = {Proceedings of the IEEE International Conference on Computer Vision and Pattern Recognition (CVPR)},
year = {2011},
keywords = {report_X, IM2.IP3, Group van Gool, inproceedings}
}

@inproceedings{vangool010a,
title = {Scene Carving: Scene Consistent Image Retargeting},
author = {Mansfield, A.  and Gehler, P.  and Gool, L. Van and Rother, C. },
booktitle = {European Conference on Computer Vision (ECCV)},
year = {2010},
keywords = {report_X, IM2.IP3, Group van Gool, inproceedings}
}

@inproceedings{vangool011a,
title = {Visibility Maps for Improving Seam Carving},
author = {Mansfield, A.  and Gehler, P.  and Gool, L. Van and Rothe, C. },
booktitle = {Media Retargeting Workshop, European Conference on Computer Vision (ECCV)},
year = {2010},
keywords = {report_X, IM2.IP3, Group van Gool, inproceedings}
}

@inproceedings{vangool012a,
title = {Tracking in Broadcast Sports},
author = {Yao, A.  and Uebersax, D.  and Gall, J.  and Gool, L. Van},
booktitle = {32nd Annual Symposium of the German Association for Pattern Recognition},
year = {2010},
keywords = {report_X, IM2.IP3, Group van Gool, inproceedings}
}

@inproceedings{vangool013a,
title = {Object Flow: Learning object displacement},
author = {Lalos, C.  and Grabner, H.  and Gool, L. Van and Varvarigo, T. },
booktitle = {Proceedings IEEE Workshop on Visual Surveillance},
year = {2010},
keywords = {report_X, IM2.IP3, Group van Gool, inproceedings}
}

@inproceedings{vangool014a,
title = {Discrimination of locomotion direction at different speeds: A comparison between macaque monkeys and algorithms},
author = {Nater, F.  and Vangeneugden, J.  and Grabner, H.  and Gool, L. Van and Vogels, R. },
booktitle = {ECML Workshop on rare audio-visual cues},
year = {2010},
keywords = {report_X, IM2.IP3, Group van Gool, inproceedings}
}

@inproceedings{vangool015a,
title = {Hough Forest-Based Facial Expression Recognition from Video Sequences},
author = {Fanelli, G.  and Yao, A.  and Noel, P. -L.  and Gall, J.  and Gool, L. Van},
booktitle = {International Workshop on Sign, Gesture and Activity (SGA) 2010, in conjunction with ECCV 2010},
year = {2010},
keywords = {report_X, IM2.IP3, Group van Gool, inproceedings}
}

@inproceedings{vangool016a,
title = {Automatic Workflow Monitoring in Industrial Environments},
author = {Veres, G.  and Grabner, H.  and Middleton, L.  and Gool, L. Van},
booktitle = {Proceedings Asian Conference on Computer Vision (ACCV)},
year = {2010},
keywords = {report_X, IM2.IP3, Group van Gool, inproceedings}
}

@inproceedings{vangool018a,
title = {Orientation invariant 3D object Classification using Hough Transform based methods},
author = {Knopp, J.  and Prasad, M.  and Gool, L. Van},
booktitle = {Proceedings of the ACM workshop on 3D object retrieval},
year = {2010},
keywords = {report_X, IM2.IP3, Group van Gool, inproceedings}
}

@inproceedings{vangool019a,
title = {Hough Transform and {3D SURF} for robust three dimensional classification},
author = {Knopp, J.  and Prasad, M.  and Willems, G.  and Timofte, R.  and Gool, L. Van},
booktitle = {Proceedings of the European Conference on Computer Vision},
year = {2010},
keywords = {report_X, IM2.IP3, Group van Gool, inproceedings}
}

@inproceedings{vangool020a,
title = {On-line Adaption of Class-specific Codebooks for Instance Tracking},
author = {Gall, J.  and Razavi, N.  and Gool, Luc Van},
booktitle = {British Machine Vision Conference},
year = {2010},
keywords = {report_X, IM2.IP3, Group van Gool, inproceedings}
}

@inproceedings{vangool021a,
title = {Backprojection Revisited: Scalable Multi-view Object Detection and Similarity Metrics for Detections},
author = {Razavi, N.  and Gall, J.  and Gool, Luc Van},
booktitle = {European Conference on Computer Vision},
year = {2010},
keywords = {report_X, IM2.IP3, Group van Gool, inproceedings}
}

@inproceedings{vangool022a,
title = {Size does matter: improving object recognition and 3D reconstruction with cross-media analysis of image clusters},
author = {Gammeter, S.  and Quack, T.  and Tingdahl, D.  and van Gool, Luc},
booktitle = {European Conference on Computer Vision (ECCV 2010)},
year = {2010},
keywords = {report_X, IM2.IP3, Group van Gool, inproceedings}
}

@inproceedings{vangool023a,
title = {Cascaded Confidence Filtering for Improved Tracking-by-Detection},
author = {Stalder, S.  and Grabner, H.  and Gool, L. Van},
booktitle = {European Conference on Computer Vision (ECCV)},
year = {2010},
keywords = {report_X, IM2.IP3, Group van Gool, inproceedings}
}

@inproceedings{vangool024a,
title = {Improving Data Association by Joint Modeling of Pedestrian Trajectories and Groupings},
author = {Pellegrini, S.  and Ess, A.  and Gool, L. Van},
booktitle = {European Conference on Computer Vision (ECCV)},
year = {2010},
keywords = {report_X, IM2.IP3, Group van Gool, inproceedings}
}

@inproceedings{vangool025a,
title = {Wrong Turn - No Dead End: a Stochastic Pedestrian Motion Model},
author = {Pellegrini, S.  and Ess, A.  and Tanaskovic, M.  and Gool, L. Van},
booktitle = {International Workshop on Socially Intelligent Surveillance and Monitoring (SISM)},
year = {2010},
keywords = {report_X, IM2.IP3, Group van Gool, inproceedings}
}

@inproceedings{EPFL-CONF-167581,
title = {Head pose detection using {F}ast {R}obust {PCA} for {S}ide {A}ctive {A}ppearance {M}odels under {O}cclusion},
author = {Yüce, Anil and Sorci, M.  and Thiran, J. -Ph. },
booktitle = {Proceedings of the 2011 {I}nternational {C}onference on {I}mage {P}rocessing, {C}omputer {V}ision, and {P}attern {R}ecognition ({IPCV} 2011)},
year = {2011},
keywords = {report_X, IM2.IP1, Group Pun, article}
}

@inproceedings{yuce2011,
title = {Head pose detection using Fast Robust PCA for Side Active Appearance Models under Occlusion},
author = {Yüce, A.  and Sorci, M.  and Thiran, J. -Ph. },
booktitle = {International Conference on Image Processing, Computer Vision, and Pattern Recognition (IPCV 2011)},
year = {2011},
keywords = {report_X, IM2.IP1, Group Pun, article}
}

@article{pun001,
title = {{DEAP}: A database for emotion analysis using physiological signal},
author = {Koelstra, S.  and Mühl, C.  and Soleymani, M.  and Lee, J. -S.  and Yazdani, A.  and Ebrahimi, T.  and Pun, T.  and Nijholt, A.  and Patras, I. },
journal = {IEEE Trans. on Affective Computing, Special Issue on Naturalistic Affect Resources for System Building and Evaluation},
year = {2011},
keywords = {report_X, IM2.IP1, Group Pun, article}
}

@article{pun002,
title = {Toward local and global perception modules for vision substitution},
author = {Bologna, G.  and Deville, B.  and Pun, T. },
journal = {Neurocomputing},
year = {2010},
volume = {74},
number = {8},
pages = {1182--1190},
keywords = {report_X, IM2.IP1, Group Pun, article}
}

@article{pun003,
title = {Emotion assessment from physiological signals for adaptation of games difficulty},
author = {Chanel, G.  and Rebetez, C.  and Betrancourt, M.  and Pun, T. },
journal = {IEEE Trans. on Systems, Man, and Cybernetics - Part A: Systems and Humans},
year = {2011},
keywords = {report_X, IM2.IP1, Group Pun, article}
}

@book{pun005,
title = {Proceedings of the 33rd International ACM SIGIR Conference on Research and Development in Information Retrieval, SIGIR 2010},
author = {Crestani, F.  and Marchand-Maillet, S.  and Chen, H. -H.  and Efthimiadis, E. N.  and Savoy, J. },
year = {2010},
publisher = {ACM, New York, USA},
keywords = {report_X, IM2.IP1, Group Pun, book}
}

@inproceedings{pun006,
title = {Multisource sonification for visual substitution in an auditory memory game: one, or two fingers?},
author = {Gomez, J.  and Bologna, G.  and Deville, B.  and Pun, T. },
booktitle = {ICAD 2011, Int. Conf. on Auditory Display},
year = {2011},
keywords = {report_X, IM2.IP1, Group Pun, inproceedings}
}

@inproceedings{pun007,
title = {Automatic tagging and geo-tagging in video collections and communities},
author = {Larson, M.  and Soleymani, M.  and Serdyukov, P.  and Rudinac, S.  and Wartena, C.  and Friedland, G.  and Murdock, V.  and Ordelman, R.  and Jonesv, G. J. F. },
booktitle = {ACM Int. Conf. on Multimedia Retrieval (ICMR) 2011},
year = {2011},
keywords = {report_X, IM2.IP1, Group Pun, inproceedings}
}

@inproceedings{pun008,
title = {Continuous emotion detection in response to music videos},
author = {Soleymani, M.  and Koelstra, S.  and Patras, I.  and Pun, T. },
booktitle = {EmoSPACE 2011, 1st Int. Workshop on Emotion Synthesis, rePresentation, and Analysis in Continuous spacE, in conjunction with IEEE FG 2011},
year = {2011},
keywords = {report_X, IM2.IP1, Group Pun, inproceedings}
}

@inproceedings{pun009,
title = {Color-audio encoding interface for visual substitution: See Color Matlab-based demo},
author = {Gomez, J. D.  and Bologna, G.  and Pun, T. },
booktitle = {ASSETS 2010, 12th Int. ACM SigAccess Conf. on Computers and Accessibility, Demonstrations Track},
year = {2010},
keywords = {report_X, IM2.IP1, Group Pun, inproceedings}
}

@inproceedings{pun010,
title = {Detecting objects and obstacles for visually impaired individuals using visual saliency},
author = {Deville, B.  and Bologna, G.  and Pun, T. },
booktitle = {ASSETS 2010, 12th Int. ACM SigAccess Conf. on Computers and Accessibility, Demonstrations Track},
year = {2010},
keywords = {report_X, IM2.IP1, Group Pun, inproceedings}
}

@inproceedings{pun011,
title = {Crowdsourcing for affective annotation of video: development of a viewer-reported boredom corpus},
author = {Soleymani, M.  and Larson, M. },
booktitle = {33rd ACM SIGIR, Workshop on Crowdsourcing for Search Evaluation},
year = {2010},
keywords = {report_X, IM2.IP1, Group Pun, inproceedings}
}

@inproceedings{pun013,
title = {Effective Multimodal Information Fusion by Structure Learning},
author = {Kludas, J.  and Marchand-Maillet, S. },
booktitle = {14th International Conference on Information Fusion (FUSION 2011)},
year = {2011},
keywords = {report_X, IM2.IP1, Group Pun, inproceedings}
}

@inproceedings{pun014,
title = {A Parallel Cross-Modal Search Engine over Large-Scale Multimedia Collections with Interactive Relevance Feedback},
author = {von Wyl, M.  and Mohamed, H.  and Bruno, E.  and Marchand-Maillet, S. },
booktitle = {ACM International Conference on Multimedia Retrieval (ACM-ICMR'11)},
year = {2011},
keywords = {report_X, IM2.IP1, Group Pun, inproceedings}
}

@inproceedings{pun015,
title = {Query log simulation for long-term learning in image retrieval},
author = {Morrison, D.  and Bruno, E.  and Marchand-Maillet, S. },
booktitle = {Content-Based Multimedia Indexing (CBMI'11)},
year = {2011},
keywords = {report_X, IM2.IP1, Group Pun, inproceedings}
}

@inproceedings{pun016,
author = {Morrison, D.  and Bruno, E.  and Marchand-Maillet, S. },
booktitle = {ACM MULTIMEDIA 2010 (Demo Program)},
year = {2010},
keywords = {report_X, IM2.IP1, Group Pun, inproceedings}
}

@article{pop00010,
title = {Finding Information in Multimedia Records of Meetings},
author = {Popescu-Belis, Andrei and Lalanne, Denis and Bourlard, Hervé},
journal = {IEEE Multimedia},
year = {2011},
publisher = {ieeexplore.ieee.org},
keywords = {report_X, IM2.IP2, Group Ingold, article}
}

@inproceedings{SchwallerLK10,
title = {PyGmI: creation and evaluation of a portable gestural interface},
author = {Schwaller, Matthias and Lalanne, Denis and Khaled, Omar Abou},
booktitle = {NordiCHI},
year = {2010},
pages = {773--776},
keywords = {report_X, IM2.IP2, Group Ingold, inproceedings}
}

@inproceedings{CarrinoTMKI11,
title = {Head-Computer Interface: A Multimodal Approach to Navigate through Real and Virtual Worlds},
author = {Carrino, Francesco and Tscherrig, Julien and Mugellini, Elena and Khaled, Omar Abou and Ingold, Rolf},
booktitle = {HCI (2)},
year = {2011},
pages = {222--230},
keywords = {report_X, IM2.IP2, Group Ingold, inproceedings}
}

@inproceedings{CarrinoMKI11,
title = {ARAMIS: Toward a Hybrid Approach for Human- Environment Interaction},
author = {Carrino, Stefano and Mugellini, Elena and Khaled, Omar Abou and Ingold, Rolf},
booktitle = {HCI (3)},
year = {2011},
pages = {165--174},
keywords = {report_X, IM2.IP2, Group Ingold, inproceedings}
}

@article{BoyandinBBL11,
title = {Flowstrates: An Approach for Visual Exploration of Temporal Origin-Destination Data},
author = {Boyandin, Ilya and Bertini, Enrico and Bak, Peter and Lalanne, Denis},
journal = {Computer Graphics Forum},
year = {2011},
volume = {30},
number = {3},
pages = {971--980},
keywords = {report_X, IM2.IP2, Group Ingold, article}
}

@article{BrueggerLLH10,
title = {Enriching the Design and Prototyping Loop: a Set of Tools to Support the Creation of Activity-Based Pervasive Applications},
author = {Bruegger, Pascal and Lisowska, Agnes and Lalanne, Denis and Hirsbrunner, Beat},
journal = {Journal of Mobile Multimedia},
year = {2010},
volume = {6},
number = {4},
pages = {339--360},
keywords = {report_X, IM2.IP2, Group Ingold, article}
}

@article{DumasJMUI10,
title = {Description languages for multimodal interaction: a set of guidelines and its illustration with SMUIML},
author = {Dumas, Bruno and Lalanne, Denis and Ingold, Rolf},
journal = {Journal on Multimodal User Interfaces},
year = {2010},
volume = {3},
number = {3},
pages = {237--247},
publisher = {Springer Berlin / Heidelberg},
keywords = {report_X, IM2.IP2, Group Ingold, article}
}

@inproceedings{EvequozIHM2010,
title = {Gérer son information personnelle au moyen de la navigation par facettes},
author = {Evéquoz, Florian and Thomet, Julien and Lalanne, Denis},
booktitle = {Conférence Internationale Francophone sur l'Interaction Homme-Machine},
series = {IHM '10},
year = {2010},
pages = {41--48},
publisher = {ACM},
keywords = {report_X, IM2.IP2, Group Ingold, inproceedings}
}

@article{MekhaldiJMTAP11,
title = {A Multimodal Alignment Framework for Spoken Documents},
author = {Mekhaldi, Dalila and Lalanne, Denis and Ingold, Rolf},
journal = {International Journal of Multimedia Tools and Applications},
year = {2011},
keywords = {report_X, IM2.IP1, Group Ingold, article}
}

@inproceedings{LalanneM11,
title = {A Fitt of distraction: measuring the impact of distracters and multi-users on pointing efficiency},
author = {Lalanne, Denis and Masson, Agnes Lisowska},
booktitle = {CHI Extended Abstracts},
year = {2011},
pages = {2125--2130},
keywords = {report_X, IM2.IP1, Group Ingold, inproceedings}
}

@article{MekhaldiJMPT10,
title = {Multimodal Document Alignment: Feature-based Validation to Strengthen Thematic Links},
author = {Mekhaldi, Dalila and Lalanne, Denis},
journal = {Journal of Multimedia Processing Technologies},
year = {2010},
volume = {1},
number = {1},
pages = {30--46},
keywords = {report_X, IM2.IP1, Group Ingold, article}
}

@inproceedings{VerdetMBH10,
title = {Channel detectors for system fusion in the context of NIST LRE 2009},
author = {Verdet, Florian and Matrouf, Driss and Bonastre, Jean-Fran\c{c}ois and Hennebert, Jean},
booktitle = {INTERSPEECH},
year = {2010},
pages = {733--736},
keywords = {report_X, IM2.IP1, Group Ingold, inproceedings}
}

@inproceedings{HadjarI10,
title = {Improving XED for extracting content from Arabic PDFs},
author = {Hadjar, Karim and Ingold, Rolf},
booktitle = {Document Analysis Systems},
year = {2010},
pages = {371--376},
keywords = {report_X, IM2.IP1, Group Ingold, inproceedings}
}

@proceedings{doceng2010,
title = {Proceedings of the 2010 ACM Symposium on Document Engineering, Manchester, United Kingdom, September 21-24, 2010},
editor = {Antonacopoulos, Apostolos and Gormish, Michael J.  and Ingold, Rolf},
booktitle = {ACM Symposium on Document Engineering},
year = {2010},
publisher = {ACM},
keywords = {report_X, IM2.IP1, Group Ingold, proceedings}
}

@article{ivanov11_ieeespm,
title = {In tags we trust: Trust modeling in social tagging of multimedia content},
author = {Ivanov, I.  and Vajda, P.  and Lee, J. -S.  and Ebrahimi, T. },
journal = {IEEE Signal Processing Magazine},
year = {2011},
keywords = {report_X, IM2.IP3, Group Ebrahimi, article}
}

@article{koelstra11_ieeetac,
title = {{DEAP}: a database for emotion analysis using physiological signals},
author = {Koelstra, S.  and Muehl, C.  and Soleymani, M.  and Lee, J. -S.  and Yazdani, A.  and Ebrahimi, T.  and Pun, T.  and Nijholt, A.  and Patras, I. },
journal = {IEEE Trans. Affective Computing},
year = {2011},
keywords = {report_X, IM2.IP3, Group Ebrahimi, article}
}

@inproceedings{DeSimone2011b,
title = {Performance analysis of VP8 image and video compression based on subjective evaluations},
author = {DeSimone, F.  and Goldmann, L.  and Lee, J. S.  and Ebrahimi, T. },
booktitle = {SPIE Optics and Photonics, Applications of Digital Image Processing XXXIV, 8135},
year = {2011},
keywords = {report_X, IM2.IP1, Group Ebrahimi, inproceedings},
owner = {francesca},
timestamp = {2011.09.08}
}

@article{DeSimone2011,
title = {Towards high efficiency video coding: subjective evaluation of potential coding technologies},
author = {DeSimone, F.  and Goldmann, L.  and Lee, J. -S.  and Ebrahimi, T. },
journal = {Journal of Visual Communication and Image Representation},
year = {2011},
keywords = {report_X, IM2.IP1, Group Ebrahimi, article}
}

@article{DeSimone2011a,
title = {Subjective quality assessment of H.264/AVC video streaming with packet losses},
author = {DeSimone, F.  and Naccari, M.  and Tagliasacchi, M.  and Dufaux, F.  and Tubaro, S.  and Ebrahimi, T. },
journal = {Eurasip Journal on Image and Video Processing, 2011 Article ID 190431},
year = {2011},
keywords = {report_X, IM2.IP1, Group Ebrahimi, article},
owner = {francesca},
timestamp = {2011.09.08}
}

@inproceedings{lee11_icassp,
title = {Audio-visual synchronization recovery in multimedia content},
author = {Lee, J. -S.  and Ebrahimi, T. },
booktitle = {Proc. International Conference on Acoustics, Speech and Signal Processing (ICASSP'11)},
year = {2011},
month = {May},
pages = {2280--2283},
keywords = {report_X, IM2.IP1, Group Ebrahimi, inproceedings}
}

@article{lee11_ieeetmm,
title = {Subjective quality evaluation via paired comparison: application to scalable video coding},
author = {Lee, J. -S.  and Simone, F. De and Ebrahimi, T. },
journal = {IEEE Transactions on Multimedia},
year = {2011},
keywords = {report_X, IM2.IP1, Group Ebrahimi, article}
}

@article{lee11_ieeejstsp,
title = {Subjective quality evaluation of foveated video coding using audio-visual focus of attention},
author = {Lee, J. -S.  and Simone, F. De and Ebrahimi, T. },
journal = {IEEE Journal of Selected Topics in Signal Processing},
year = {2011},
keywords = {report_X, IM2.IP1, Group Ebrahimi, article}
}

@inproceedings{lee11_qomex,
title = {Subjective quality assessment of scalable video coding},
author = {Lee, J. -S.  and Simone, F. De and Ebrahimi, T. },
booktitle = {Proc. International Workshop on Quality of Multimedia Experience (QoMEX'11)},
year = {2011},
keywords = {report_X, IM2.IP1, Group Ebrahimi, inproceedings}
}

@article{lee10_jvci,
title = {Video coding based on audio-visual focus of attention},
author = {Lee, J. -S.  and Simone, F. De and Ebrahimi, T. },
journal = {Journal of Visual Communication and Image Representation},
year = {2010},
keywords = {report_X, IM2.IP1, Group Ebrahimi, article}
}

@inproceedings{vajda11_dspe,
title = {Omnidirectional object duplicate detection},
author = {Vajda, P.  and Ivanov, I.  and Goldmann, L.  and Ebrahimi, T. },
booktitle = {Proc. International Workshop on Digital Signal Processing (DSPE'11)},
year = {2011},
pages = {332--337},
keywords = {report_X, IM2.IP1, Group Ebrahimi, inproceedings}
}

@inproceedings{vajda11_icme,
title = {Social game {E}pitome versus automatic visual analysis},
author = {Vajda, P.  and Ivanov, I.  and Goldmann, L.  and Ebrahimi, T. },
booktitle = {Proc. International Conference on Multimedia and Expo (ICME'11)},
year = {2011},
keywords = {report_X, IM2.IP1, Group Ebrahimi, inproceedings}
}

@inproceedings{vajda11_icmedemo,
title = {Let {E}pitome summarize your photo collection!},
author = {Vajda, P.  and Ivanov, I.  and Goldmann, L.  and Ebrahimi, T. },
booktitle = {Proc. International Conference on Multimedia and Expo (ICME'11)},
year = {2011},
keywords = {report_X, IM2.IP1, Group Ebrahimi, inproceedings}
}

@inproceedings{dillenbourg001,
title = {Classroom orchestration: The third circle of usability.},
author = {Dillenbourg, P.  and Zufferey, G.  and Alavi, H. S.  and Jermann, P.  and Do Lenh, S.  and Bonnard, Q.  and Cuendet, S.  and Kaplan, F. },
booktitle = {Proceedings of the 9th Computer-Supported Collaborative Learning Conference, Hong Kong},
year = {2011},
keywords = {report_X, IM2.IP2, Group Dillenbourg, inproceedings}
}

@unpublished{dillenbourg002,
title = {A Tabletop Environment for Augmenting Meetings with Background Search},
author = {Li, N.  and Mubin, O.  and Kaplan, F.  and Dillenbourg, P. },
year = {\bibnodate},
note = {Under peer review for the ITS2011 conference, Kobe, Japan},
keywords = {report_X, IM2.IP2, Group Dillenbourg, unpublished}
}

@article{Valente_SPEECHCOM_2010,
title = {Multi-Stream Speech Recognition based on Dempster-Shafer Combination Rule},
author = {Valente, Fabio},
journal = {Speech Communication},
year = {2010},
volume = {52},
number = {3},
keywords = {report_X, IM2.IP1, Group Bourlard, article}
}

@article{Saheer_TASLP_2011,
title = {Vocal Tract Length Normalization for Statistical Parametric Speech Synthesis},
author = {Saheer, Lakshmi and Dines, John and Garner, Philip N. },
journal = {IEEE Transactions on Audio, Speech and Language Processing},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, article}
}

@article{Valente_SPEECHCOM-2_2010,
title = {Hierarchical and Parallel Processing of Auditory and Modulation Frequencies for Automatic Speech Recognition},
author = {Valente, Fabio},
journal = {Speech Communication},
year = {2010},
volume = {52},
number = {10},
keywords = {report_X, IM2.IP1, Group Bourlard, article}
}

@article{Chen_TVCG_2011,
title = {Learning a 3D Human Pose Distance Metric from Geometric Pose Descriptor},
author = {Chen, Cheng},
journal = {IEEE Transactions on Visualization and Computer Graphics},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, article}
}

@article{Berclaz_TPAMI_2011,
title = {Multiple Object Tracking using K-Shortest Paths Optimization},
author = {Berclaz, Jerome and Turetken, Engin and Fleuret, Francois and Fua, Pascal},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, article}
}

@article{Chen_CVIU_2011,
title = {3D human pose recovery from image by efficient visual feature selection},
author = {Chen, Cheng and Yang, Yi and Nie, Feiping and Odobez, Jean-Marc},
journal = {Computer Vision and Image Understanding},
year = {2011},
volume = {115},
number = {3},
keywords = {report_X, IM2.IP1, Group Bourlard, article}
}

@article{Do_SPEECHCOMMUNICATION_2011,
title = {A novel framework for noise robust ASR using cochlear implant-like spectrally reduced speech},
author = {Do, Cong-Thanh and Pastor, Dominique and Goalic, André},
journal = {Speech Communication},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, article}
}

@article{Ali_TPAMI_2011,
title = {A real-time deformable detector.},
author = {Ali, Karim and Fleuret, Francois and Hasler, David and Fua, Pascal},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, article}
}

@article{Varadarajan_IEEETRANSCSVT_2011,
title = {A Sequential Topic Model for Mining Recurrent Activities from Video and Audio Data Logs},
journal = {IEEE Transactions on Circuits and Systems for Video Technology},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, article}
}

@article{Popescu-Belis_CSL_2011,
title = {Automatic Identification of Discourse Markers in Multiparty Dialogues: An In-Depth Study of Like and Well},
author = {Popescu-Belis, Andrei and Zufferey, Sandrine},
journal = {Computer Speech and Language},
year = {2011},
volume = {25},
number = {3},
pages = {499--518},
keywords = {report_X, IM2.IP1, Group Bourlard, article}
}

@article{Garner_SPECOM_2011,
title = {Cepstral normalisation and the signal to noise ratio spectrum in automatic speech recognition},
author = {Garner, Philip N. },
journal = {Speech Communication},
year = {2011},
volume = {53},
number = {8},
pages = {991--1001},
keywords = {report_X, IM2.IP1, Group Bourlard, article}
}

@article{Sanchez-Cortes_IEEETMM_2011,
title = {Detecting Emergent Leaders in Small Groups using Nonverbal Behavior},
author = {Sanchez-Cortes, Dairazalia and Aran, Oya and Schmid Mast, Marianne and Gatica-Perez, Daniel},
editor = {Chen, Sheng-Wei},
journal = {IEEE Transactions on Multimedia},
year = {2011},
keywords = {report_X, IM2.IP3, Group Bourlard, article}
}

@article{Jayagopi_MULTIMEDIASYSTEMS_2011,
title = {Privacy-sensitive recognition of group conversational context with sociometers},
author = {Jayagopi, Dinesh Babu and Kim, Taemie and Pentland, Alex and Gatica-Perez, Daniel},
journal = {Springer Multimedia Systems Journal},
year = {2011},
keywords = {report_X, IM2.IP3, Group Bourlard, article}
}

@article{Vinciarelli_IEEESPM_2010,
title = {www.sspnet.eu: A Web Portal for Social Signal Processing},
author = {Vinciarelli, Alessandro and Pantic, Maja},
journal = {IEEE Signal Processing Magazine},
year = {2010},
volume = {27},
number = {4},
pages = {142--144},
keywords = {report_X, IM2.IP3, Group Bourlard, article},
projects = {SSPNet}
}

@book{Vinciarelli_SPRINGER_2010,
title = {Human Behavior Understanding},
author = {Vinciarelli, Alessandro},
editor = {Salah, Albert Ali and Gevers, Theo and Sebe, Nicu and Vinciarelli, Alessandro},
year = {2010},
publisher = {Springer Verlag},
keywords = {report_X, IM2.IP3, Group Bourlard, book}
}

@incollection{Biel_SPRINGER_2001,
title = {Call me Guru: user categories and large-scale behavior in YouTube},
author = {Biel, Joan-Isaac and Gatica-Perez, Daniel},
booktitle = {Social Media Computing},
year = {2011},
publisher = {Springer},
keywords = {report_X, IM2.IP3, Group Bourlard, incollection}
}

@inproceedings{Rasipuram_ICANN2011_2011,
title = {Improving Articulatory Feature and Phoneme Recognition using Multitask Learning},
author = {Rasipuram, Ramya and Magimai.-Doss, Mathew},
booktitle = {Artificial Neural Networks and Machine Learning - ICANN 2011},
year = {2011},
pages = {299--306},
publisher = {Springer Berlin / Heidelberg},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Imseng_INTERSPEECH_2011,
title = {Improving non-native ASR through stochastic multilingual phoneme space transformations},
author = {Imseng, David and Bourlard, Hervé and Dines, John and Garner, Philip N.  and Magimai.-Doss, Mathew},
booktitle = {Proceedings of Interspeech},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Rasipuram_ICASSP_2011,
title = {Integrating articulatory features using Kullback-Leibler divergence based acoustic model for phoneme recognition},
author = {Rasipuram, Ramya and Magimai.-Doss, Mathew},
booktitle = {Proceedings IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP},
year = {2011},
pages = {5192--5195},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Scheffler_BMVC_2011,
title = {Joint Adaptive Colour Modelling and Skin, Hair and Clothing Segmentation Using Coherent Probabilistic Index Maps},
author = {Scheffler, Carl and Odobez, Jean-Marc},
booktitle = {British Machine Vision Conference},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Imseng_ICASSP_2011,
title = {Language dependent universal phoneme posterior estimation for mixed language speech recognition},
author = {Imseng, David and Bourlard, Hervé and Magimai.-Doss, Mathew and Dines, John},
booktitle = {Proceedings IEEE International Conference on Acoustics, Speech and Signal Processing},
year = {2011},
pages = {5012--5015},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Parthasarathi_INTERSPEECH_2011,
title = {LP Residual Features for Robust, Privacy-Sensitive Speaker Diarization},
author = {Parthasarathi, Sree Hari Krishnan and Bourlard, Hervé and Gatica-Perez, Daniel},
booktitle = {Interspeech},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Asaei_ICASSP_2011,
title = {Model-based Compressive Sensing for Multi-party Distant Speech Recognition},
author = {Asaei, Afsaneh and Bourlard, Hervé and Cevher, Volkan},
booktitle = {2011 IEEE International Conference on Acoustics, Speech and Signal Processing},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Asaei_INTERSPEECH_2011,
title = {Multi-party Speech Recovery Exploiting Structured Sparsity Models},
author = {Asaei, Afsaneh and Taghizadeh, Mohammad J.  and Bourlard, Hervé and Cevher, Volkan},
booktitle = {Proceedings of International Speech Communication Association, INTERSPEECH},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Roy_ICASSP11_2011,
title = {Phoneme Recognition using Boosted Binary Features},
author = {Roy, Anindya and Magimai.-Doss, Mathew and Marcel, Sébastien},
booktitle = {IEEE Intl. Conference on Acoustics, Speech and Signal Processing 2011},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Liang_INTERSPEECH_2011,
title = {Phonological Knowledge Guided HMM State Mapping for Cross-Lingual Speaker Adaptation},
author = {Liang, Hui and Dines, John},
booktitle = {Proceedings of Interspeech},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Soldo_ICASSP_2011,
title = {Posterior Features for Template-based ASR},
author = {Soldo, Serena and Magimai.-Doss, Mathew and Pinto, Joel Praveen and Bourlard, Hervé},
booktitle = {Proceedings of IEEE International Conference on Acoustics, Speech, and Signal Processing},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{BenShitrit_ICCV_2011,
title = {Tracking Multiple Objects under Global Appearance Constraints},
author = {Ben Shitrit, Horesh and Berclaz, Jerome and Fleuret, Francois and Fua, Pascal},
booktitle = {Proceedings of the IEEE International Conference on Computer Vision},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Popescu-Belis_TEXTGRAPH_2011,
title = {Using a Wikipedia-based Semantic Relatedness Measure for Document Clustering},
author = {Yazdani, Majid and Popescu-Belis, Andrei},
booktitle = {Graph-based Methods for Natural Language Processing},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Valente_INTERSPEECH2010_2010,
title = {A Comparative Study of MLP Front-ends for Mandarin ASR},
author = {Valente, Fabio and Magimai.-Doss, Mathew and Plahl, Christian and Suman, Ravuri and Wen, Wang},
booktitle = {Proceedings of Interspeech, Japan},
year = {2010},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Varadarajan_NIPS_2010,
title = {A Sparsity Constraint for Topic Models - Application to Temporal Activity Mining},
internal-note = {NOTE(review): author field is missing -- TODO verify and add authors},
booktitle = {NIPS-2010 Workshop on Practical Applications of Sparse Modeling: Open Issues and New Directions},
year = {2010},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Kurimo_ACL_2010,
title = {Personalising speech-to-speech translation in the EMIME project},
author = {Kurimo, Mikko and Byrne, William and Dines, John and Garner, Philip N.  and Gibson, Matthew and Guan, Yong and Hirsim{\"a}ki, Teemu and Karhila, Reima and King, Simon and Liang, Hui and Oura, Keiichiro and Saheer, Lakshmi and Shannon, Matt and Shiota, Sayaka and Tian, Jilei and Tokuda, Keiichi and Wester, Mirjam and Wu, Yi-Jian and Yamagishi, Junichi},
booktitle = {Proceedings of the ACL 2010 System Demonstrations},
year = {2010},
organization = {Association for Computational Linguistics},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Lefevre_VISAPP_2010,
title = {View-Based Appearance Model Online Learning for 3D Deformable Face Tracking},
author = {Lef{è}vre, Stéphanie and Odobez, Jean-Marc},
booktitle = {Proc. Int. Conf. on Computer Vision Theory and Applications},
year = {2010},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Suditu_ICCV_2011,
title = {HEAT: Iterative Relevance Feedback with One Million Images},
author = {Suditu, Nicolae and Fleuret, Francois},
booktitle = {International Conference on Computer Vision},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Emonet_CVPR_2011,
title = {Extracting and Locating Temporal Motifs in Video Scenes Using a Hierarchical Non Parametric Bayesian Model},
internal-note = {NOTE(review): author field is missing -- TODO verify and add authors},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Ali_CVPR_2011,
title = {FlowBoost - Appearance Learning from Sparsely Annotated Video},
author = {Ali, Karim and Hasler, David and Fleuret, Francois},
booktitle = {Proceedings of the IEEE international conference on Computer Vision and Pattern Recognition},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Magimai.-Doss_INTERSPEECH2011_2011,
title = {Grapheme-based Automatic Speech Recognition using {KL-HMM}},
author = {Magimai.-Doss, Mathew and Rasipuram, Ramya and Aradilla, Guillermo and Bourlard, Hervé},
booktitle = {Proceedings of Interspeech},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Wester_INTERSPEECH_2011,
title = {Cross-Lingual Speaker Discrimination Using Natural and Synthetic Speech},
author = {Wester, Mirjam and Liang, Hui},
booktitle = {Proceedings of Interspeech},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Duffner_FG_2011,
title = {Exploiting Long-Term Observations for Track Creation and Deletion in Online Multi-Face Tracking},
author = {Duffner, Stefan and Odobez, Jean-Marc},
booktitle = {IEEE Conference on Automatic Face and Gesture Recognition},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Chen_AVSS_2011,
title = {Combined Estimation of Location and Body Pose in Surveillance Video},
author = {Chen, Cheng and Heili, Alexandre and Odobez, Jean-Marc},
booktitle = {AVSS},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Taghizadeh_HSCMA_2011,
title = {An Integrated Framework for Multi-Channel Multi-Source Localization and Voice Activity Detection},
author = {Taghizadeh, Mohammad J.  and Garner, Philip N.  and Bourlard, Hervé and Abutalebi, Hamid Reza and Asaei, Afsaneh},
booktitle = {The Third Joint Workshop on Hands-free Speech Communication and Microphone Arrays},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Ozcan_BMVC11_2011,
title = {A Large-Scale Database of Images and Captions for Automatic Face Naming},
author = {Ozcan, Mert and Luo, Jie and Ferrari, Vittorio and Caputo, Barbara},
booktitle = {Proceedings of the 22nd British Machine Vision Conference},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{BanitalebiDehkordi_AISP_2011,
title = {A Compressive Sensing Based Compressed Neural Network for Sound Source Localization},
author = {Banitalebi Dehkordi, Mehdi and Abutalebi, Hamid Reza and Ghanei, Hossein},
booktitle = {Proceedings of International Symposium on Artificial Intelligence and Signal Processing},
year = {2011},
keywords = {report_X, IM2.IP1, Group Bourlard, inproceedings}
}

@inproceedings{Popescu-Belis_SIGDIAL2011_2011,
title = {A Just-in-Time Document Retrieval System for Dialogues or Monologues},
author = {Popescu-Belis, Andrei and Yazdani, Majid and Nanchen, Alexandre and Garner, Philip N. },
booktitle = {SIGDIAL 2011 (12th annual SIGDIAL Meeting on Discourse and Dialogue), Demonstration Session},
year = {2011},
pages = {350--352},
keywords = {report_X, IM2.IP2, Group Bourlard, inproceedings}
}

@inproceedings{Popescu-Belis_SIGIR2010_2010,
title = {Automatic Content Linking: Speech-based Just-in-time Retrieval for Multimedia Archives},
author = {Popescu-Belis, Andrei and Kilgour, Jonathan and Poller, Peter and Nanchen, Alexandre and Boertjes, Erik and de Wit, Joost},
booktitle = {Proceedings of the 33rd Annual ACM SIGIR Conference},
year = {2010},
pages = {703},
keywords = {report_X, IM2.IP2, Group Bourlard, inproceedings}
}

@inproceedings{Mohammadi_ACMMM,
title = {Automatic Attribution of Personality Traits Based on Prosodic Features},
author = {Mohammadi, Gelareh and Vinciarelli, Alessandro},
booktitle = {Proceedings of ACM Multimedia 2011 workshop},
year = {2011},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@inproceedings{Do_MDM_2011,
title = {Contextual grouping: discovering real-life interaction types from longitudinal Bluetooth data},
author = {Do, Trinh-Minh-Tri and Gatica-Perez, Daniel},
booktitle = {12th International Conference on Mobile Data Management},
year = {2011},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@inproceedings{Chittaranjan_FG11_2011,
title = {Exploiting observers' judgements for nonverbal group interaction analysis},
author = {Chittaranjan, Gokul and Aran, Oya and Gatica-Perez, Daniel},
booktitle = {IEEE Conference on Automatic Face and Gesture Recognition},
year = {2011},
pages = {6},
publisher = {IEEE},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@inproceedings{Do_ISWC_2011,
title = {GroupUs: Smartphone Proximity Data and Human Interaction Type Mining},
author = {Do, Trinh-Minh-Tri and Gatica-Perez, Daniel},
booktitle = {15th annual International Symposium on Wearable Computers},
year = {2011},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@inproceedings{Mohammadi_IEEESMC2011_2011,
title = {Humans as Feature Extractors: Combining Prosody and Personality Perception for Better Speaking Style Recognition},
author = {Mohammadi, Gelareh and Vinciarelli, Alessandro},
booktitle = {Proceeding of IEEE Int Conference on Systems, Man, and Cybernetics - Special Sessions},
year = {2011},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@inproceedings{Chittaranjan_NIPS-HCD2011_2011,
title = {Inferring truth from multiple annotators for social interaction analysis},
author = {Chittaranjan, Gokul and Aran, Oya and Gatica-Perez, Daniel},
booktitle = {Neural Information Processing Systems (NIPS) Workshop on Modeling Human Communication Dynamics (HCD)},
year = {2011},
pages = {4},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@inproceedings{Madan_PERVASIVE_2011,
title = {Pervasive Sensing to Model Political Opinions in Face-to-Face Networks},
author = {Madan, Anmol and Farrahi, Katayoun and Gatica-Perez, Daniel and Pentland, Alex},
booktitle = {Pervasive},
year = {2011},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@inproceedings{Korchagin_ICME_2011,
title = {Social Focus of Attention as a Time Function Derived from Multimodal Signals},
author = {Korchagin, Danil and Abutalebi, Hamid Reza},
booktitle = {Proceedings IEEE International Conference on Multimedia & Expo},
year = {2011},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@inproceedings{Chittaranjan_ISWC11_2011,
title = {Who's Who with Big-Five: Analyzing and Classifying Personality Traits with Smartphones},
author = {Chittaranjan, Gokul and Blom, J.  and Gatica-Perez, Daniel},
booktitle = {International Symposium on Wearable Computing},
year = {2011},
pages = {8},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@inproceedings{Biel_ICWSMa,
title = {You Are Known by How You Vlog: Personality Impressions and Nonverbal Behavior in YouTube},
author = {Biel, Joan-Isaac and Aran, Oya and Gatica-Perez, Daniel},
booktitle = {Proceedings of AAAI International Conference on Weblogs and Social Media},
year = {2011},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@inproceedings{Do_MUM2010_2010,
title = {By their apps you shall understand them: mining large-scale patterns of mobile phone usage},
author = {Do, Trinh-Minh-Tri and Gatica-Perez, Daniel},
booktitle = {The 9th International Conference on Mobile and Ubiquitous Multimedia},
year = {2010},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@inproceedings{Montoliu_MUM2010_2010,
title = {Discovering Human Places of Interest from Multimodal Mobile Phone Data},
author = {Montoliu, Raul and Gatica-Perez, Daniel},
booktitle = {Proceedings of 9th International Conference on Mobile and Ubiquitous Multimedia},
year = {2010},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@inproceedings{Sanchez-Cortes_ICMI-MLMI2010_2010,
title = {Identifying Emergent Leadership in Small Groups using Nonverbal Communicative Cues},
author = {Sanchez-Cortes, Dairazalia and Aran, Oya and Schmid Mast, Marianne and Gatica-Perez, Daniel},
booktitle = {Proc. ICMI-MLMI '10 International Conference on Multimodal Interfaces and the Workshop on Machine Learning for Multimodal Interaction},
year = {2010},
publisher = {ACM},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@inproceedings{Valente_SSPW2010_2010,
title = {Improving Speech Processing through Social Signals: Automatic Speaker Segmentation of Political Debates using Role based Turn-Taking Patterns},
author = {Valente, Fabio and Vinciarelli, Alessandro},
booktitle = {Proceedings of ACM Multimedia Workshop on Social Signal Processing},
year = {2010},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@inproceedings{Farrahi_SOCIALCOM-2_2010,
title = {Mining Human Location-Routines Using a Multi-Level Approach to Topic Modeling},
author = {Farrahi, Katayoun and Gatica-Perez, Daniel},
booktitle = {2010 IEEE Second International Conference on Social Computing, SIN Symposium},
year = {2010},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@inproceedings{Vinciarelli_IWMHCI_2010,
title = {Mobile Social Signal Processing: vision and research issues},
author = {Vinciarelli, Alessandro and Murray-Smith, Roderick and Bourlard, Hervé},
booktitle = {Proceedings of the International Workshop on Mobile HCI},
year = {2010},
pages = {513--516},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@inproceedings{Jayagopi_MUM2010_2010,
title = {Recognizing conversational context in group interaction using privacy-sensitive mobile sensors},
author = {Jayagopi, Dinesh Babu and Kim, Taemie and Pentland, Alex and Gatica-Perez, Daniel},
booktitle = {Proceedings of International Conference on Mobile and Ubiquitous Multimedia, Limassol, Cyprus},
year = {2010},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@inproceedings{Vinciarelli_MEASURINGBEHAVIOR_2010,
title = {Social Signal Processing: Understanding Nonverbal Communication in Social Interactions},
author = {Vinciarelli, Alessandro and Valente, Fabio},
booktitle = {Proceedings of Measuring Behavior 2010, Eindhoven (The Netherlands)},
year = {2010},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@inproceedings{Murino_SISM_2010,
title = {Socially Intelligent Surveillance and Monitoring: Analysing Social Dimensions of Physical Space},
author = {Murino, V.  and Cristani, M.  and Vinciarelli, Alessandro},
booktitle = {Proceedings of International Workshop on Socially Intelligent Surveillance and Monitoring},
year = {2010},
pages = {51--58},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@inproceedings{Mohammadi_ACM_2010,
title = {The Voice of Personality: Mapping Nonverbal Vocal Behavior into Trait Attributions},
author = {Mohammadi, Gelareh and Vinciarelli, Alessandro and Mortillaro, Marcello},
booktitle = {Proceedings of ACM Multimedia Workshop on Social Signal Processing},
year = {2010},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@inproceedings{Biel_ICMI-MLMI_2010,
title = {Vlogcast Yourself: Nonverbal Behavior and Attention in Social Media},
author = {Biel, Joan-Isaac and Gatica-Perez, Daniel},
booktitle = {Proceedings International Conference on Multimodal Interfaces (ICMI-MLMI)},
year = {2010},
keywords = {report_X, IM2.IP3, Group Bourlard, inproceedings}
}

@techreport{Skoumas_Idiap-RR-23-2011,
title = {Intuitive Recipes for Uncertainty Decoding with SNR Features for Noise Robust ASR},
author = {Skoumas, Georgios and Garner, Philip N. },
year = {2011},
type = {Idiap-RR},
number = {Idiap-RR-23-2011},
institution = {Idiap},
keywords = {report_X, IM2.IP1, Group Bourlard, techreport},
projects = {Idiap},
}

@techreport{Asaei_Idiap-RR-22-2011,
title = {Multi-party Speech Recovery Exploiting Structured Sparsity Models},
author = {Asaei, Afsaneh and Taghizadeh, Mohammad J.  and Bourlard, Hervé and Cevher, Volkan},
year = {2011},
type = {Idiap-RR},
number = {Idiap-RR-22-2011},
institution = {Idiap},
keywords = {report_X, IM2.IP1, Group Bourlard, techreport},
abstract = {We study the sparsity of spectro-temporal representation of speech in reverberant acoustic conditions. This study motivates the use of structured sparsity models for efficient recovery of speech. We formulate the underdetermined convolutive speech separation in spectro-temporal domain as the sparse signal recovery where we leverage model-based recovery algorithms. To tackle the ambiguity of the real acoustics, we exploit the Image Model of the enclosures to estimate the room impulse response function through a structured sparsity constraint optimization. The experiments conducted on real data recordings demonstrate the effectiveness of the proposed approach for multi-party speech applications.},
projects = {Idiap}
}

@techreport{Rasipuram_Idiap-RR-21-2011,
title = {Multitask Learning to Improve Articulatory Feature Estimation and Phoneme Recognition},
author = {Rasipuram, Ramya and Magimai.-Doss, Mathew},
year = {2011},
type = {Idiap-RR},
number = {Idiap-RR-21-2011},
institution = {Idiap},
keywords = {report_X, IM2.IP1, Group Bourlard, techreport},
abstract = {Speech sounds can be characterized by articulatory features. Articulatory features are typically estimated using a set of multilayer perceptrons (MLPs), i.e., a separate MLP is trained for each articulatory feature. In this report, we investigate multitask learning (MTL) approach for joint estimation of articulatory features with and without phoneme classification as subtask. The effect of number of subtasks in MTL is studied by selecting two different articulatory feature representations. Our studies show that MTL MLP can estimate articulatory features compactly and efficiently by learning the inter-feature dependencies through a common hidden layer representation, irrespective of number of subtasks. Furthermore, adding phoneme as subtask while estimating articulatory features improves both articulatory feature estimation and phoneme recognition. On TIMIT phoneme recognition task, articulatory feature posterior probabilities obtained by MTL MLP achieve a phoneme recognition accuracy of 73.8\%, while the phoneme posterior probabilities achieve an accuracy of 74.2\%.},
}

@techreport{Korchagin_Idiap-RR-20-2011,
title = {Impact of Excitation Frequency on Short-Term Recording Synchronisation and Confidence Estimation},
author = {Korchagin, Danil},
year = {2011},
type = {Idiap-RR},
number = {Idiap-RR-20-2011},
institution = {Idiap},
keywords = {report_X, IM2.IP1, Group Bourlard, techreport},
abstract = {In this paper, we present the results of a study on excitation frequency impact on short-term recording synchronisation and confidence estimation for multisource audiovisual data, recorded by different personal capturing devices during social events. The core of the algorithm is based on perceptual time-quefrency analysis with a precision of 10 ms. Performance levels achieved to date on 14  hours hand-labelled dataset have shown positive impact of excitation frequency on temporal synchronisation (98.19\% precision for 5 s recordings) and confidence estimation (99.08\% precision with 100\% recall for 5 s recordings). The results surpass the performance of fast cross correlation while keeping lower system requirements.},
projects = {Idiap,
TA2},
}

@techreport{Imseng_Idiap-RR-19-2011,
title = {Improving non-native ASR through stochastic multilingual phoneme space transformations},
author = {Imseng, David and Bourlard, Hervé and Dines, John and Garner, Philip N.  and Magimai.-Doss, Mathew},
year = {2011},
type = {Idiap-RR},
number = {Idiap-RR-19-2011},
institution = {Idiap},
keywords = {report_X, IM2.IP1, Group Bourlard, techreport},
abstract = {We propose a stochastic phoneme space transformation technique that allows the conversion of conditional source phoneme posterior probabilities (conditioned on the acoustics) into target phoneme posterior probabilities. The source and target phonemes can be in any language and phoneme format such as the International Phonetic Alphabet. The novel technique makes use of a Kullback-Leibler divergence based hidden Markov model and can be applied to non-native and accented speech recognition or used to adapt systems to under-resourced languages. In this paper, and in the context of hybrid HMM/MLP recognizers, we successfully apply the proposed approach to non-native English speech recognition on the HIWIRE dataset.},
projects = {Idiap,
IM2,
EMIME},
}

@techreport{Wester_Idiap-RR-18-2011,
title = {Cross-Lingual Speaker Discrimination Using Natural and Synthetic Speech},
author = {Wester, Mirjam and Liang, Hui},
year = {2011},
type = {Idiap-RR},
number = {Idiap-RR-18-2011},
institution = {Idiap},
keywords = {report_X, IM2.IP1, Group Bourlard, techreport},
abstract = {This paper describes speaker discrimination experiments in which native English listeners were presented with natural speech stimuli in English and Mandarin, synthetic speech stimuli in English and Mandarin, or natural Mandarin speech and synthetic English speech stimuli. In each experiment, listeners were asked to judge whether the sentences in a pair were spoken by the same person or not. We found that the results of Mandarin/English speaker discrimination were very similar to those found in previous work on German/English and Finnish/English speaker discrimination. We conclude from this and previous work that listeners are able to discriminate between speakers across languages or across speech types, but the combination of these two factors leads to a speaker discrimination task that is too difficult for listeners to perform successfully, given the fact that the quality of across-language speaker adapted speech synthesis at present still needs to be improved.},
projects = {EMIME},
}

@techreport{Liang_Idiap-RR-17-2011,
title = {Phonological Knowledge Guided HMM State Mapping for Cross-Lingual Speaker Adaptation},
author = {Liang, Hui and Dines, John},
year = {2011},
type = {Idiap-RR},
number = {Idiap-RR-17-2011},
institution = {Idiap},
keywords = {report_X, IM2.IP1, Group Bourlard, techreport},
abstract = {Within the HMM state mapping-based cross-lingual speaker adaptation framework, the minimum Kullback-Leibler divergence criterion has been typically employed to measure the similarity of two average voice state distributions from two respective languages for state mapping construction. Considering that this simple criterion doesn't take any language-specific information into account, we propose a data-driven, phonological knowledge guided approach to strengthen the mapping construction -- state distributions from the two languages are clustered according to broad phonetic categories using decision trees and mapping rules are constructed only within each of the clusters. Objective evaluation of our proposed approach demonstrates reduction of mel-cepstral distortion and that mapping rules derived from a single training speaker generalize to other speakers, with subtle improvement being detected during subjective listening tests.},
projects = {EMIME},
}

@techreport{Taghizadeh_Idiap-RR-16-2011,
title = {An Integrated Framework for Multi-Channel Multi-Source Localization and Voice Activity Detection},
author = {Taghizadeh, Mohammad J.  and Garner, Philip N.  and Bourlard, Hervé and Abutalebi, Hamid Reza and Asaei, Afsaneh},
booktitle = {IEEE Workshop on Hands-free Speech Communication and Microphone Arrays},
year = {2011},
type = {Idiap-RR},
number = {Idiap-RR-16-2011},
institution = {Idiap},
location = {30 May -- 1 June 2011, Edinburgh, Scotland},
keywords = {report_X, IM2.IP1, Group Bourlard, techreport},
abstract = {Two of the major challenges in microphone array based adap- tive beamforming, speech enhancement and distant speech recognition, are robust and accurate source localization and voice activity detection. This paper introduces a spatial gra- dient steered response power using the phase transform (SRP- PHAT) method which is capable of localization of competing speakers in overlapping conditions. We further investigate the behavior of the SRP function and characterize theoretically a fixed point in its search space for the diffuse noise field. We call this fixed point the null position in the SRP search space. Building on this evidence, we propose a technique for multi- channel voice activity detection (MVAD) based on detection of a maximum power corresponding to the null position. The gradient SRP-PHAT in tandem with the MVAD form an inte- grated framework of multi-source localization and voice ac- tivity detection. The experiments carried out on real data recordings show that this framework is very effective in prac- tical applications of hands-free communication.},
projects = {Idiap,
IM2},
}

@techreport{Garner_Idiap-RR-15-2011,
title = {Cepstral normalisation and the signal to noise ratio spectrum in automatic speech recognition.},
author = {Garner, Philip N. },
year = {2011},
type = {Idiap-RR},
number = {Idiap-RR-15-2011},
institution = {Idiap},
keywords = {report_X, IM2.IP1, Group Bourlard, techreport},
abstract = {Cepstral normalisation in automatic speech recognition is investigated in the context of robustness to additive noise. It is argued that such normalisation leads naturally to a speech feature based on signal to noise ratio rather than absolute energy (or power). Explicit calculation of this {\em SNR-cepstrum} by means of a noise estimate is shown to have theoretical and practical advantages over the usual (energy based) cepstrum. The SNR-cepstrum is shown to be almost identical to the articulation index known in psycho-acoustics. Combination of the SNR-cepstrum with the well known perceptual linear prediction method is shown to be beneficial in noisy environments.},
projects = {IM2},
}

@techreport{Parthasarathi_Idiap-RR-14-2011,
title = {LP Residual Features for Robust, Privacy-Sensitive Speaker Diarization},
author = {Parthasarathi, Sree Hari Krishnan and Bourlard, Hervé and Gatica-Perez, Daniel},
year = {2011},
type = {Idiap-RR},
number = {Idiap-RR-14-2011},
institution = {Idiap},
keywords = {report_X, IM2.IP1, Group Bourlard, techreport},
abstract = {We present a comprehensive study of linear prediction residual for speaker diarization on single and multiple distant microphone conditions in privacy-sensitive settings, a requirement to analyze a wide range of spontaneous conversations. Two representations of the residual are compared, namely real-cepstrum and MFCC, with the latter performing better. Experiments on RT06eval show that residual with subband information from 2.5 kHz to 3.5 kHz and spectral slope yields a performance close to traditional MFCC features. As a way to objectively evaluate privacy in terms of linguistic information, we perform phoneme recognition. Residual features yield low phoneme accuracies compared to traditional MFCC features.},
projects = {Idiap,
SNSF-MULTI},
}

@techreport{Imseng_Idiap-RR-13-2011,
title = {Language dependent universal phoneme posterior estimation for mixed language speech recognition},
author = {Imseng, David and Bourlard, Hervé and Magimai.-Doss, Mathew and Dines, John},
year = {2011},
type = {Idiap-RR},
number = {Idiap-RR-13-2011},
institution = {Idiap},
keywords = {report_X, IM2.IP1, Group Bourlard, techreport},
abstract = {This paper presents a new approach to estimate ``universal'' phoneme posterior probabilities for mixed language speech recognition. More specifically, we propose a new theoretical framework to combine phoneme class posterior probabilities in a principled way by using (statistical) evidence about the language identity. We investigate the proposed approach in a mixed language environment (SpeechDat(II)) consisting of five European languages. Our studies show that the proposed approach can yield significant improvements on a mixed language task, while maintaining the performance on monolingual tasks. Additionally, through a case study, we also demonstrate the potential benefits of the proposed approach for non-native speech recognition.},
projects = {Idiap,
SNSF-MULTI,
IM2,
EMIME},
}

@techreport{Parthasarathi_Idiap-RR-12-2011,
title = {Privacy-Sensitive Audio Features for Speech/Nonspeech Detection},
author = {Parthasarathi, Sree Hari Krishnan and Gatica-Perez, Daniel and Bourlard, Hervé and Magimai.-Doss, Mathew},
year = {2011},
type = {Idiap-RR},
number = {Idiap-RR-12-2011},
institution = {Idiap},
keywords = {report_X, IM2.IP1, Group Bourlard, techreport},
abstract = {The goal of this paper is to investigate features for speech/nonspeech detection (SND) having ``minimal'' linguistic information from the speech signal. Towards this, we present a comprehensive study of privacy-sensitive features for SND in multiparty conversations. Our study investigates three different approaches to privacy-sensitive features. These approaches are based on: (a) simple, instantaneous feature extraction methods; (b) excitation source information based methods; and (c) feature obfuscation methods such as local (within 130 ms) temporal averaging and randomization applied on excitation source information. To evaluate these approaches for SND, we use multiparty conversational meeting data of nearly 450 hours. On this dataset, we evaluate these features and benchmark them against state-of-the-art spectral shape based features such as Mel-Frequency Perceptual Linear Prediction (MF-PLP). Fusion strategies combining excitation source with simple features show that state-of-the-art performance can be obtained in both close-talking and far-field microphone scenarios. As one way to quantify and evaluate the notion of privacy, we conduct Automatic Speech Recognition (ASR) studies on TIMIT. While excitation source features yield phoneme recognition accuracies in between the simple features and the MF-PLP features, obfuscation methods applied on the excitation features yield low phoneme accuracies in conjunction with state-of-the-art SND performance.},
projects = {Idiap,
SNSF-MULTI},
}

@techreport{Korchagin_Idiap-RR-10-2011,
  title       = {Just-in-Time Multimodal Association and Fusion from Home Entertainment},
  author      = {Korchagin, Danil and Motlicek, Petr and Duffner, Stefan and Bourlard, Hervé},
  year        = {2011},
  type        = {Idiap-RR},
  number      = {Idiap-RR-10-2011},
  institution = {Idiap},
  keywords    = {report_X, IM2.IP1, Group Bourlard, techreport},
  abstract    = {In this paper, we describe a real-time multimodal analysis system with just-in-time multimodal association and fusion for a living room environment, where multiple people may enter, interact and leave the observable world with no constraints. It comprises detection and tracking of up to 4 faces, detection and localisation of verbal and paralinguistic events, their association and fusion. The system is designed to be used in open, unconstrained environments like in next generation video conferencing systems that automatically ``orchestrate'' the transmitted video streams to improve the overall experience of interaction between spatially separated families and friends. Performance levels achieved to date on hand-labelled dataset have shown sufficient reliability at the same time as fulfilling real-time processing requirements.},
  projects    = {Idiap, TA2},
}

@techreport{Asaei_Idiap-RR-04-2011,
  title       = {Model-Based Compressive Sensing for Multi-Party Distant Speech Recognition},
  author      = {Asaei, Afsaneh and Bourlard, Hervé and Cevher, Volkan},
  year        = {2011},
  type        = {Idiap-RR},
  number      = {Idiap-RR-04-2011},
  institution = {Idiap},
  keywords    = {report_X, IM2.IP1, Group Bourlard, techreport},
  abstract    = {We leverage the recent algorithmic advances in compressive sensing, and propose a novel source separation algorithm for efficient recovery of convolutive speech mixtures in spectro-temporal domain. Compared to the common sparse component analysis techniques, our approach fully exploits structured sparsity models to obtain substantial improvement over the existing state-of-the-art. We evaluate our method for separation and recognition of a target speaker in a multi-party scenario. Our results provide compelling evidence of the effectiveness of sparse recovery formulations in speech recognition.},
  projects    = {Idiap},
}

@techreport{Rasipuram_Idiap-RR-02-2011,
  title       = {Integrating Articulatory Features Using {Kullback-Leibler} Divergence Based Acoustic Model for Phoneme Recognition},
  author      = {Rasipuram, Ramya and Magimai.-Doss, Mathew},
  year        = {2011},
  type        = {Idiap-RR},
  number      = {Idiap-RR-02-2011},
  institution = {Idiap},
  keywords    = {report_X, IM2.IP1, Group Bourlard, techreport},
  abstract    = {In this paper, we propose a novel framework to integrate articulatory features (AFs) into HMM- based ASR system. This is achieved by using posterior probabilities of different AFs (estimated by multilayer perceptrons) directly as observation features in Kullback-Leibler divergence based HMM (KL-HMM) system. On the TIMIT phoneme recognition task, the proposed framework yields a phoneme recognition accuracy of 72.4\% which is comparable to KL-HMM system using posterior probabilities of phonemes as features (72.7\%). Furthermore, a best performance of 73.5\% phoneme recognition accuracy is achieved by jointly modelling AF probabilities and phoneme probabilities as features. This shows the efficacy and flexibility of the proposed approach.},
  projects    = {Idiap},
}

@techreport{Duffner_Idiap-RR-01-2011,
  title       = {Exploiting Long-Term Observations for Track Creation and Deletion in Online Multi-Face Tracking},
  author      = {Duffner, Stefan and Odobez, Jean-Marc},
  year        = {2011},
  type        = {Idiap-RR},
  number      = {Idiap-RR-01-2011},
  institution = {Idiap},
  address     = {Rue Marconi 19, CH-1920 Martigny},
  keywords    = {report_X, IM2.IP1, Group Bourlard, techreport},
  abstract    = {In many visual multi-object tracking applications, the question when to add or remove a target is not trivial due to, for example, erroneous outputs of object detectors or observation models that cannot describe the full variability of the objects to track. In this paper, we present a real-time, online multi-face tracking algorithm that effectively deals with missing or uncertain detections in a principled way. To this end, we propose to use long-term image observations, and an explicit probabilistic filtering framework that decides when to add or remove a target from the tracker. We evaluated the proposed method on three different difficult datasets with a total length of almost 10 hours and show a significant increase in performance of the tracking.},
  projects    = {Idiap, TA2},
}

@techreport{Pinto_Idiap-RR-39-2010,
  title       = {Hierarchical Tandem Features for ASR in Mandarin},
  author      = {Pinto, Joel Praveen and Magimai.-Doss, Mathew and Bourlard, Hervé},
  year        = {2010},
  type        = {Idiap-RR},
  number      = {Idiap-RR-39-2010},
  institution = {Idiap},
  keywords    = {report_X, IM2.IP1, Group Bourlard, techreport},
  abstract    = {We apply multilayer perceptron (MLP) based hierarchical Tandem features to large vocabulary continuous speech recognition in Mandarin. Hierarchical Tandem features are estimated using a cascade of two MLP classifiers which are trained independently. The first classifier is trained on perceptual linear predictive coefficients with a 90 ms temporal context. The second classifier is trained using the phonetic class conditional probabilities estimated by the first MLP, but with a relatively longer temporal context of about 150 ms. Experiments on the Mandarin DARPA GALE eval06 dataset show significant reduction (about 7.6\% relative) in character error rates by using hierarchical Tandem features over conventional Tandem features.},
  projects    = {Idiap, SNSF-KEYSPOT, IM2},
}

@techreport{Varadarajan_Idiap-RR-36-2010,
  title       = {A Sparsity Constraint for Topic Models - Application to Temporal Activity Mining},
  author      = {Varadarajan, Jagannadan and Emonet, Rémi and Odobez, Jean-Marc},
  year        = {2010},
  type        = {Idiap-RR},
  number      = {Idiap-RR-36-2010},
  institution = {Idiap},
  keywords    = {report_X, IM2.IP1, Group Bourlard, techreport},
  abstract    = {We address the mining of sequential activity patterns from document logs given as word-time occurrences. We achieve this using topics that model both the cooccurrence and the temporal order in which words occur within a temporal window. Discovering such topics, which is particularly hard when multiple activities can occur simultaneously, is conducted through the joint inference of the temporal topics and of their starting times, allowing the implicit alignment of the same activity occurrences in the document. A current issue is that while we would like topic starting times to be represented by sparse distributions, this is not achieved in practice. Thus, in this paper, we propose a method that encourages sparsity, by adding regularization constraints on the searched distributions. The constraints can be used with most topic models (e.g. PLSA, LDA) and lead to a simple modified version of the EM standard optimization procedure. The effect of the sparsity constraint on our activity model and the robustness improvement in the presence of different noises have been validated on synthetic data. Its effectiveness is also illustrated in video activity analysis, where the discovered topics capture frequent patterns that implicitly represent typical trajectories of scene objects.},
  internal-note = {author field was missing (required for techreport); names restored from the Idiap-RR-36-2010 record -- verify against the original report},
  projects    = {Idiap},
}

@techreport{Imseng_Idiap-RR-35-2010,
  title       = {Tuning-Robust Initialization Methods for Speaker Diarization},
  author      = {Imseng, David and Friedland, Gerald},
  year        = {2010},
  type        = {Idiap-RR},
  number      = {Idiap-RR-35-2010},
  institution = {Idiap},
  address     = {Centre du Parc, Rue Marconi 19, Case Postale 592, CH-1920 Martigny},
  keywords    = {report_X, IM2.IP1, Group Bourlard, techreport},
  abstract    = {This paper investigates a typical Speaker Diarization system regarding its robustness against initialization parameter variation and presents a method to reduce manual tuning of these values significantly. The behavior of an agglomerative hierarchical clustering system is studied to determine which initialization parameters impact accuracy most. We show that the accuracy of typical systems is indeed very sensitive to the values chosen for the initialization parameters and factors such as the length of the recording. We then present a solution that reduces the sensitivity of the initialization values and therefore reduces the need for manual tuning significantly while at the same time increasing the accuracy of the system. For short meetings extracted from the previous (2006 and 2007) National Institute of Standards and Technology (NIST) Rich Transcription (RT) evaluation data, the decrease of the Diarization Error Rate is up to 50\% relative. The approach consists of a novel initial parameter estimation method for Speaker Diarization that uses agglomerative clustering with Bayesian Information Criterion (BIC) and Gaussian Mixture Models (GMMs) of frame-based cepstral features (MFCCs). The estimation method leverages the relationship between the optimal value of the seconds of speech data per Gaussian and the duration of the speech data and is combined with a novel non-uniform initialization method. This approach results in a system that performs better than the current ICSI baseline engine on datasets of the NIST RT evaluations of the years 2006 and 2007.},
  projects    = {Idiap, IM2},
}

@techreport{Dines_Idiap-RR-34-2010,
  title       = {Measuring the gap between HMM-based ASR and TTS},
  author      = {Dines, John and Yamagishi, Junichi and King, Simon},
  year        = {2010},
  type        = {Idiap-RR},
  number      = {Idiap-RR-34-2010},
  institution = {Idiap},
  keywords    = {report_X, IM2.IP1, Group Bourlard, techreport},
  projects    = {Idiap, EMIME},
}

@techreport{Popescu-Belis_Idiap-RR-32-2011,
title = {Finding Information in Multimedia Records of Meetings},
author = {Popescu-Belis, Andrei and Lalanne, Denis and Bourlard, Hervé},
year = {2011},
type = {Idiap-RR},
number = {Idiap-RR-32-2011},
institution = {Idiap},
keywords = {report_X, IM2.IP2, Group Bourlard, techreport},
abstract = {This paper overviews the work carried out within two large consortia on improving the access to records of human meetings using multimodal interfaces. The design of meeting browsers has emerged as an important goal, with both theoretical interest and practical applications. Meeting browsers are assistance tools that help humans navigate through multimedia records of meetings (audio, video, documents, and metadata), in order to obtain a general idea about what happened in a meeting or to find specific pieces of information, for discovery or verification. To explain the importance that meeting browsers have gained in time, the paper summarizes findings of user studies, discusses features of meeting browser prototypes, and outlines the main evaluation protocol proposed. Reference scores are provided for future benchmarking. These achievements in meeting browsing constitute an iterative software process, from user studies to prototypes and then to products.},
projects = {Idiap,
IM2,
AMIDA},
}

@techreport{Popescu-Belis_Idiap-RR-31-2011,
title = {A Speech-based Just-in-Time Retrieval System using Semantic Search},
author = {Popescu-Belis, Andrei and Yazdani, Majid and Nanchen, Alexandre and Garner, Philip N. },
crossref = {Popescu-Belis_ACL-HLT2011_2011},
year = {2011},
type = {Idiap-RR},
number = {Idiap-RR-31-2011},
institution = {Idiap},
keywords = {report_X, IM2.IP2, Group Bourlard, techreport},
projects = {Idiap,
AMIDA,
IM2},
pdf = {https://publidiap.idiap.ch/downloads//reports/2011/Popescu-Belis_Idiap-R