@inproceedings{raha-etal-2019-development,
title = "Development of {POS} tagger for {E}nglish-{B}engali Code-Mixed data",
author = "Raha, Tathagata and
Mahata, Sainik and
Das, Dipankar and
Bandyopadhyay, Sivaji",
editor = "Sharma, Dipti Misra and
Bhattacharya, Pushpak",
booktitle = "Proceedings of the 16th International Conference on Natural Language Processing",
month = dec,
year = "2019",
address = "International Institute of Information Technology, Hyderabad, India",
publisher = "NLP Association of India",
url = "https://aclanthology.org/2019.icon-1.17",
pages = "143--149",
abstract = "Code-mixed texts are widespread nowadays due to the advent of social media. Since these texts combine two languages to formulate a sentence, it gives rise to various research problems related to Natural Language Processing. In this paper, we try to excavate one such problem, namely, Parts of Speech tagging of code-mixed texts. We have built a system that can POS tag English-Bengali code-mixed data where the Bengali words were written in Roman script. Our approach initially involves the collection and cleaning of English-Bengali code-mixed tweets. These tweets were used as a development dataset for building our system. The proposed system is a modular approach that starts by tagging individual tokens with their respective languages and then passes them to different POS taggers, designed for different languages (English and Bengali, in our case). Tags given by the two systems are later joined together and the final result is then mapped to a universal POS tag set. Our system was checked using 100 manually POS tagged code-mixed sentences and it returned an accuracy of 75.29{\%}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="raha-etal-2019-development">
<titleInfo>
<title>Development of POS tagger for English-Bengali Code-Mixed data</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tathagata</namePart>
<namePart type="family">Raha</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sainik</namePart>
<namePart type="family">Mahata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dipankar</namePart>
<namePart type="family">Das</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sivaji</namePart>
<namePart type="family">Bandyopadhyay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th International Conference on Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dipti</namePart>
<namePart type="given">Misra</namePart>
<namePart type="family">Sharma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pushpak</namePart>
<namePart type="family">Bhattacharya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India</publisher>
<place>
<placeTerm type="text">International Institute of Information Technology, Hyderabad, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Code-mixed texts are widespread nowadays due to the advent of social media. Since these texts combine two languages to formulate a sentence, it gives rise to various research problems related to Natural Language Processing. In this paper, we try to excavate one such problem, namely, Parts of Speech tagging of code-mixed texts. We have built a system that can POS tag English-Bengali code-mixed data where the Bengali words were written in Roman script. Our approach initially involves the collection and cleaning of English-Bengali code-mixed tweets. These tweets were used as a development dataset for building our system. The proposed system is a modular approach that starts by tagging individual tokens with their respective languages and then passes them to different POS taggers, designed for different languages (English and Bengali, in our case). Tags given by the two systems are later joined together and the final result is then mapped to a universal POS tag set. Our system was checked using 100 manually POS tagged code-mixed sentences and it returned an accuracy of 75.29%.</abstract>
<identifier type="citekey">raha-etal-2019-development</identifier>
<location>
<url>https://aclanthology.org/2019.icon-1.17</url>
</location>
<part>
<date>2019-12</date>
<extent unit="page">
<start>143</start>
<end>149</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Development of POS tagger for English-Bengali Code-Mixed data
%A Raha, Tathagata
%A Mahata, Sainik
%A Das, Dipankar
%A Bandyopadhyay, Sivaji
%Y Sharma, Dipti Misra
%Y Bhattacharya, Pushpak
%S Proceedings of the 16th International Conference on Natural Language Processing
%D 2019
%8 December
%I NLP Association of India
%C International Institute of Information Technology, Hyderabad, India
%F raha-etal-2019-development
%X Code-mixed texts are widespread nowadays due to the advent of social media. Since these texts combine two languages to formulate a sentence, it gives rise to various research problems related to Natural Language Processing. In this paper, we try to excavate one such problem, namely, Parts of Speech tagging of code-mixed texts. We have built a system that can POS tag English-Bengali code-mixed data where the Bengali words were written in Roman script. Our approach initially involves the collection and cleaning of English-Bengali code-mixed tweets. These tweets were used as a development dataset for building our system. The proposed system is a modular approach that starts by tagging individual tokens with their respective languages and then passes them to different POS taggers, designed for different languages (English and Bengali, in our case). Tags given by the two systems are later joined together and the final result is then mapped to a universal POS tag set. Our system was checked using 100 manually POS tagged code-mixed sentences and it returned an accuracy of 75.29%.
%U https://aclanthology.org/2019.icon-1.17
%P 143-149
Markdown (Informal)
[Development of POS tagger for English-Bengali Code-Mixed data](https://aclanthology.org/2019.icon-1.17) (Raha et al., ICON 2019)
ACL
- Tathagata Raha, Sainik Mahata, Dipankar Das, and Sivaji Bandyopadhyay. 2019. Development of POS tagger for English-Bengali Code-Mixed data. In Proceedings of the 16th International Conference on Natural Language Processing, pages 143–149, International Institute of Information Technology, Hyderabad, India. NLP Association of India.