@inproceedings{woldemariam-dahlgren-2020-adapting,
title = "Adapting Language Specific Components of Cross-Media Analysis Frameworks to Less-Resourced Languages: the Case of {A}mharic",
author = "Woldemariam, Yonas and
Dahlgren, Adam",
editor = "Beermann, Dorothee and
Besacier, Laurent and
Sakti, Sakriani and
Soria, Claudia",
booktitle = "Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources association",
url = "https://aclanthology.org/2020.sltu-1.42",
pages = "298--305",
abstract = "We present an ASR based pipeline for Amharic that orchestrates NLP components within a cross media analysis framework (CMAF). One of the major challenges that are inherently associated with CMAFs is effectively addressing multi-lingual issues. As a result, many languages remain under-resourced and fail to leverage out of available media analysis solutions. Although spoken natively by over 22 million people and there is an ever-increasing amount of Amharic multimedia content on the Web, querying them with simple text search is difficult. Searching for, especially audio/video content with simple key words, is even hard as they exist in their raw form. In this study, we introduce a spoken and textual content processing workflow into a CMAF for Amharic. We design an ASR-named entity recognition (NER) pipeline that includes three main components: ASR, a transliterator and NER. We explore various acoustic modeling techniques and develop an OpenNLP-based NER extractor along with a transliterator that interfaces between ASR and NER. The designed ASR-NER pipeline for Amharic promotes the multi-lingual support of CMAFs. Also, the state-of-the art design principles and techniques employed in this study shed light for other less-resourced languages, particularly the Semitic ones.",
language = "English",
ISBN = "979-10-95546-35-1",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="woldemariam-dahlgren-2020-adapting">
<titleInfo>
<title>Adapting Language Specific Components of Cross-Media Analysis Frameworks to Less-Resourced Languages: the Case of Amharic</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yonas</namePart>
<namePart type="family">Woldemariam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Adam</namePart>
<namePart type="family">Dahlgren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dorothee</namePart>
<namePart type="family">Beermann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laurent</namePart>
<namePart type="family">Besacier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudia</namePart>
<namePart type="family">Soria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-35-1</identifier>
</relatedItem>
<abstract>We present an ASR based pipeline for Amharic that orchestrates NLP components within a cross media analysis framework (CMAF). One of the major challenges that are inherently associated with CMAFs is effectively addressing multi-lingual issues. As a result, many languages remain under-resourced and fail to leverage out of available media analysis solutions. Although spoken natively by over 22 million people and there is an ever-increasing amount of Amharic multimedia content on the Web, querying them with simple text search is difficult. Searching for, especially audio/video content with simple key words, is even hard as they exist in their raw form. In this study, we introduce a spoken and textual content processing workflow into a CMAF for Amharic. We design an ASR-named entity recognition (NER) pipeline that includes three main components: ASR, a transliterator and NER. We explore various acoustic modeling techniques and develop an OpenNLP-based NER extractor along with a transliterator that interfaces between ASR and NER. The designed ASR-NER pipeline for Amharic promotes the multi-lingual support of CMAFs. Also, the state-of-the art design principles and techniques employed in this study shed light for other less-resourced languages, particularly the Semitic ones.</abstract>
<identifier type="citekey">woldemariam-dahlgren-2020-adapting</identifier>
<location>
<url>https://aclanthology.org/2020.sltu-1.42</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>298</start>
<end>305</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Adapting Language Specific Components of Cross-Media Analysis Frameworks to Less-Resourced Languages: the Case of Amharic
%A Woldemariam, Yonas
%A Dahlgren, Adam
%Y Beermann, Dorothee
%Y Besacier, Laurent
%Y Sakti, Sakriani
%Y Soria, Claudia
%S Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)
%D 2020
%8 May
%I European Language Resources association
%C Marseille, France
%@ 979-10-95546-35-1
%G English
%F woldemariam-dahlgren-2020-adapting
%X We present an ASR based pipeline for Amharic that orchestrates NLP components within a cross media analysis framework (CMAF). One of the major challenges that are inherently associated with CMAFs is effectively addressing multi-lingual issues. As a result, many languages remain under-resourced and fail to leverage out of available media analysis solutions. Although spoken natively by over 22 million people and there is an ever-increasing amount of Amharic multimedia content on the Web, querying them with simple text search is difficult. Searching for, especially audio/video content with simple key words, is even hard as they exist in their raw form. In this study, we introduce a spoken and textual content processing workflow into a CMAF for Amharic. We design an ASR-named entity recognition (NER) pipeline that includes three main components: ASR, a transliterator and NER. We explore various acoustic modeling techniques and develop an OpenNLP-based NER extractor along with a transliterator that interfaces between ASR and NER. The designed ASR-NER pipeline for Amharic promotes the multi-lingual support of CMAFs. Also, the state-of-the art design principles and techniques employed in this study shed light for other less-resourced languages, particularly the Semitic ones.
%U https://aclanthology.org/2020.sltu-1.42
%P 298-305
Markdown (Informal)
[Adapting Language Specific Components of Cross-Media Analysis Frameworks to Less-Resourced Languages: the Case of Amharic](https://aclanthology.org/2020.sltu-1.42) (Woldemariam & Dahlgren, SLTU 2020)
ACL