@inproceedings{gamage-etal-2021-improve,
title = "Improve {S}inhala Speech Recognition Through e2e {LF}-{MMI} Model",
author = "Gamage, Buddhi and
Pushpananda, Randil and
Nadungodage, Thilini and
Weerasinghe, Ruwan",
editor = "Bandyopadhyay, Sivaji and
Devi, Sobha Lalitha and
Bhattacharyya, Pushpak",
booktitle = "Proceedings of the 18th International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2021",
address = "National Institute of Technology Silchar, Silchar, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2021.icon-main.26",
pages = "213--219",
abstract = "Automatic speech recognition (ASR) has experienced several paradigm shifts over the years from template-based approaches and statistical modeling to the popular GMM-HMM approach and then to deep learning hybrid model DNN-HMM. The latest shift is to end-to-end (e2e) DNN architecture. We present a study to build an e2e ASR system using state-of-the-art deep learning models to verify the applicability of e2e ASR models for the highly inflected and yet low-resource Sinhala language. We experimented on e2e Lattice-Free Maximum Mutual Information (e2e LF-MMI) model with the baseline statistical models with 40 hours of training data to evaluate. We used the same corpus for creating language models and lexicon in our previous study, which resulted in the best accuracy for the Sinhala language. We were able to achieve a Word-error-rate (WER) of 28.55{\%} for Sinhala, only slightly worse than the existing best hybrid model. Our model, however, is more context-independent and faster for Sinhala speech recognition and so more suitable for general purpose speech-to-text translation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gamage-etal-2021-improve">
<titleInfo>
<title>Improve Sinhala Speech Recognition Through e2e LF-MMI Model</title>
</titleInfo>
<name type="personal">
<namePart type="given">Buddhi</namePart>
<namePart type="family">Gamage</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Randil</namePart>
<namePart type="family">Pushpananda</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thilini</namePart>
<namePart type="family">Nadungodage</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruwan</namePart>
<namePart type="family">Weerasinghe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 18th International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sivaji</namePart>
<namePart type="family">Bandyopadhyay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sobha</namePart>
<namePart type="given">Lalitha</namePart>
<namePart type="family">Devi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pushpak</namePart>
<namePart type="family">Bhattacharyya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">National Institute of Technology Silchar, Silchar, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Automatic speech recognition (ASR) has experienced several paradigm shifts over the years from template-based approaches and statistical modeling to the popular GMM-HMM approach and then to deep learning hybrid model DNN-HMM. The latest shift is to end-to-end (e2e) DNN architecture. We present a study to build an e2e ASR system using state-of-the-art deep learning models to verify the applicability of e2e ASR models for the highly inflected and yet low-resource Sinhala language. We experimented on e2e Lattice-Free Maximum Mutual Information (e2e LF-MMI) model with the baseline statistical models with 40 hours of training data to evaluate. We used the same corpus for creating language models and lexicon in our previous study, which resulted in the best accuracy for the Sinhala language. We were able to achieve a Word-error-rate (WER) of 28.55% for Sinhala, only slightly worse than the existing best hybrid model. Our model, however, is more context-independent and faster for Sinhala speech recognition and so more suitable for general purpose speech-to-text translation.</abstract>
<identifier type="citekey">gamage-etal-2021-improve</identifier>
<location>
<url>https://aclanthology.org/2021.icon-main.26</url>
</location>
<part>
<date>2021-12</date>
<extent unit="page">
<start>213</start>
<end>219</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Improve Sinhala Speech Recognition Through e2e LF-MMI Model
%A Gamage, Buddhi
%A Pushpananda, Randil
%A Nadungodage, Thilini
%A Weerasinghe, Ruwan
%Y Bandyopadhyay, Sivaji
%Y Devi, Sobha Lalitha
%Y Bhattacharyya, Pushpak
%S Proceedings of the 18th International Conference on Natural Language Processing (ICON)
%D 2021
%8 December
%I NLP Association of India (NLPAI)
%C National Institute of Technology Silchar, Silchar, India
%F gamage-etal-2021-improve
%X Automatic speech recognition (ASR) has experienced several paradigm shifts over the years from template-based approaches and statistical modeling to the popular GMM-HMM approach and then to deep learning hybrid model DNN-HMM. The latest shift is to end-to-end (e2e) DNN architecture. We present a study to build an e2e ASR system using state-of-the-art deep learning models to verify the applicability of e2e ASR models for the highly inflected and yet low-resource Sinhala language. We experimented on e2e Lattice-Free Maximum Mutual Information (e2e LF-MMI) model with the baseline statistical models with 40 hours of training data to evaluate. We used the same corpus for creating language models and lexicon in our previous study, which resulted in the best accuracy for the Sinhala language. We were able to achieve a Word-error-rate (WER) of 28.55% for Sinhala, only slightly worse than the existing best hybrid model. Our model, however, is more context-independent and faster for Sinhala speech recognition and so more suitable for general purpose speech-to-text translation.
%U https://aclanthology.org/2021.icon-main.26
%P 213-219
Markdown (Informal)
[Improve Sinhala Speech Recognition Through e2e LF-MMI Model](https://aclanthology.org/2021.icon-main.26) (Gamage et al., ICON 2021)
ACL
- Buddhi Gamage, Randil Pushpananda, Thilini Nadungodage, and Ruwan Weerasinghe. 2021. Improve Sinhala Speech Recognition Through e2e LF-MMI Model. In Proceedings of the 18th International Conference on Natural Language Processing (ICON), pages 213–219, National Institute of Technology Silchar, Silchar, India. NLP Association of India (NLPAI).