% Conference paper (COLING 2020), ACL Anthology record 2020.coling-main.530.
% Braces inside the title protect the acronym SADID and the proper noun Arabic
% from style-driven lowercasing; whole words are braced, not single letters.
@inproceedings{abid-2020-sadid,
  title     = {The {SADID} Evaluation Datasets for Low-Resource Spoken Language Machine Translation of {Arabic} Dialects},
  author    = {Abid, Wael},
  editor    = {Scott, Donia and
               Bel, Nuria and
               Zong, Chengqing},
  booktitle = {Proceedings of the 28th International Conference on Computational Linguistics},
  month     = dec,
  year      = {2020},
  address   = {Barcelona, Spain (Online)},
  publisher = {International Committee on Computational Linguistics},
  url       = {https://aclanthology.org/2020.coling-main.530},
  doi       = {10.18653/v1/2020.coling-main.530},
  pages     = {6030--6043},
  abstract  = {Low-resource Machine Translation recently gained a lot of popularity, and for certain languages, it has made great strides. However, it is still difficult to track progress in other languages for which there is no publicly available evaluation data. In this paper, we introduce benchmark datasets for Arabic and its dialects. We describe our design process and motivations and analyze the datasets to understand their resulting properties. Numerous successful attempts use large monolingual corpora to augment low-resource pairs. We try to approach augmentation differently and investigate whether it is possible to improve MT models without any external sources of data. We accomplish this by bootstrapping existing parallel sentences and complement this with multilingual training to achieve strong baselines.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding one record, ID abid-2020-sadid. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="abid-2020-sadid">
    <!-- The paper itself: title, author, issue date. -->
    <titleInfo>
      <title>The SADID Evaluation Datasets for Low-Resource Spoken Language Machine Translation of Arabic Dialects</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Wael</namePart>
      <namePart type="family">Abid</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 28th International Conference on Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Donia</namePart>
        <namePart type="family">Scott</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nuria</namePart>
        <namePart type="family">Bel</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Chengqing</namePart>
        <namePart type="family">Zong</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>International Committee on Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Low-resource Machine Translation recently gained a lot of popularity, and for certain languages, it has made great strides. However, it is still difficult to track progress in other languages for which there is no publicly available evaluation data. In this paper, we introduce benchmark datasets for Arabic and its dialects. We describe our design process and motivations and analyze the datasets to understand their resulting properties. Numerous successful attempts use large monolingual corpora to augment low-resource pairs. We try to approach augmentation differently and investigate whether it is possible to improve MT models without any external sources of data. We accomplish this by bootstrapping existing parallel sentences and complement this with multilingual training to achieve strong baselines.</abstract>
    <!-- Identifiers and resolvable location. -->
    <identifier type="citekey">abid-2020-sadid</identifier>
    <identifier type="doi">10.18653/v1/2020.coling-main.530</identifier>
    <location>
      <url>https://aclanthology.org/2020.coling-main.530</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2020-12</date>
      <extent unit="page">
        <start>6030</start>
        <end>6043</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T The SADID Evaluation Datasets for Low-Resource Spoken Language Machine Translation of Arabic Dialects
%A Abid, Wael
%Y Scott, Donia
%Y Bel, Nuria
%Y Zong, Chengqing
%S Proceedings of the 28th International Conference on Computational Linguistics
%D 2020
%8 December
%I International Committee on Computational Linguistics
%C Barcelona, Spain (Online)
%F abid-2020-sadid
%X Low-resource Machine Translation recently gained a lot of popularity, and for certain languages, it has made great strides. However, it is still difficult to track progress in other languages for which there is no publicly available evaluation data. In this paper, we introduce benchmark datasets for Arabic and its dialects. We describe our design process and motivations and analyze the datasets to understand their resulting properties. Numerous successful attempts use large monolingual corpora to augment low-resource pairs. We try to approach augmentation differently and investigate whether it is possible to improve MT models without any external sources of data. We accomplish this by bootstrapping existing parallel sentences and complement this with multilingual training to achieve strong baselines.
%R 10.18653/v1/2020.coling-main.530
%U https://aclanthology.org/2020.coling-main.530
%U https://doi.org/10.18653/v1/2020.coling-main.530
%P 6030-6043
Markdown (Informal)
[The SADID Evaluation Datasets for Low-Resource Spoken Language Machine Translation of Arabic Dialects](https://aclanthology.org/2020.coling-main.530) (Abid, COLING 2020)
ACL