@inproceedings{abulimiti-schultz-2020-building,
title = "Building Language Models for Morphological Rich Low-Resource Languages using Data from Related Donor Languages: the Case of {U}yghur",
author = "Abulimiti, Ayimunishagu and
Schultz, Tanja",
editor = "Beermann, Dorothee and
Besacier, Laurent and
Sakti, Sakriani and
Soria, Claudia",
booktitle = "Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources association",
url = "https://aclanthology.org/2020.sltu-1.38",
pages = "271--276",
abstract = "Huge amounts of data are needed to build reliable statistical language models. Automatic speech processing tasks in low-resource languages typically suffer from lower performances due to weak or unreliable language models. Furthermore, language modeling for agglutinative languages is very challenging, as the morphological richness results in higher Out Of Vocabulary (OOV) rate. In this work, we show our effort to build word-based as well as morpheme-based language models for Uyghur, a language that combines both challenges, i.e. it is a low-resource and agglutinative language. Fortunately, there exists a closely-related rich-resource language, namely Turkish. Here, we present our work on leveraging Turkish text data to improve Uyghur language models. To maximize the overlap between Uyghur and Turkish words, the Turkish data is pre-processed on the word surface level, which results in 7.76{\%} OOV-rate reduction on the Uyghur development set. To investigate various levels of low-resource conditions, different subsets of Uyghur data are generated. Morpheme-based language models trained with bilingual data achieved up to 40.91{\%} relative perplexity reduction over the language models trained only with Uyghur data.",
language = "English",
ISBN = "979-10-95546-35-1",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="abulimiti-schultz-2020-building">
<titleInfo>
<title>Building Language Models for Morphological Rich Low-Resource Languages using Data from Related Donor Languages: the Case of Uyghur</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ayimunishagu</namePart>
<namePart type="family">Abulimiti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanja</namePart>
<namePart type="family">Schultz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dorothee</namePart>
<namePart type="family">Beermann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laurent</namePart>
<namePart type="family">Besacier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudia</namePart>
<namePart type="family">Soria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-35-1</identifier>
</relatedItem>
<abstract>Huge amounts of data are needed to build reliable statistical language models. Automatic speech processing tasks in low-resource languages typically suffer from lower performances due to weak or unreliable language models. Furthermore, language modeling for agglutinative languages is very challenging, as the morphological richness results in higher Out Of Vocabulary (OOV) rate. In this work, we show our effort to build word-based as well as morpheme-based language models for Uyghur, a language that combines both challenges, i.e. it is a low-resource and agglutinative language. Fortunately, there exists a closely-related rich-resource language, namely Turkish. Here, we present our work on leveraging Turkish text data to improve Uyghur language models. To maximize the overlap between Uyghur and Turkish words, the Turkish data is pre-processed on the word surface level, which results in 7.76% OOV-rate reduction on the Uyghur development set. To investigate various levels of low-resource conditions, different subsets of Uyghur data are generated. Morpheme-based language models trained with bilingual data achieved up to 40.91% relative perplexity reduction over the language models trained only with Uyghur data.</abstract>
<identifier type="citekey">abulimiti-schultz-2020-building</identifier>
<location>
<url>https://aclanthology.org/2020.sltu-1.38</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>271</start>
<end>276</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Building Language Models for Morphological Rich Low-Resource Languages using Data from Related Donor Languages: the Case of Uyghur
%A Abulimiti, Ayimunishagu
%A Schultz, Tanja
%Y Beermann, Dorothee
%Y Besacier, Laurent
%Y Sakti, Sakriani
%Y Soria, Claudia
%S Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)
%D 2020
%8 May
%I European Language Resources association
%C Marseille, France
%@ 979-10-95546-35-1
%G English
%F abulimiti-schultz-2020-building
%X Huge amounts of data are needed to build reliable statistical language models. Automatic speech processing tasks in low-resource languages typically suffer from lower performances due to weak or unreliable language models. Furthermore, language modeling for agglutinative languages is very challenging, as the morphological richness results in higher Out Of Vocabulary (OOV) rate. In this work, we show our effort to build word-based as well as morpheme-based language models for Uyghur, a language that combines both challenges, i.e. it is a low-resource and agglutinative language. Fortunately, there exists a closely-related rich-resource language, namely Turkish. Here, we present our work on leveraging Turkish text data to improve Uyghur language models. To maximize the overlap between Uyghur and Turkish words, the Turkish data is pre-processed on the word surface level, which results in 7.76% OOV-rate reduction on the Uyghur development set. To investigate various levels of low-resource conditions, different subsets of Uyghur data are generated. Morpheme-based language models trained with bilingual data achieved up to 40.91% relative perplexity reduction over the language models trained only with Uyghur data.
%U https://aclanthology.org/2020.sltu-1.38
%P 271-276
Markdown (Informal)
[Building Language Models for Morphological Rich Low-Resource Languages using Data from Related Donor Languages: the Case of Uyghur](https://aclanthology.org/2020.sltu-1.38) (Abulimiti & Schultz, SLTU 2020)
ACL