@inproceedings{hossain-hoque-2020-towards,
title = "Towards {B}engali Word Embedding: Corpus Creation, Intrinsic and Extrinsic Evaluations",
author = "Hossain, Md. Rajib and
Hoque, Mohammed Moshiul",
editor = "Bhattacharyya, Pushpak and
Sharma, Dipti Misra and
Sangal, Rajeev",
booktitle = "Proceedings of the 17th International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2020",
address = "Indian Institute of Technology Patna, Patna, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://aclanthology.org/2020.icon-main.61",
pages = "453--459",
abstract = "Distributional word vector representation or word embedding has become an essential ingredient in many natural language processing (NLP) tasks such as machine translation, document classification, information retrieval and question answering. Investigation of embedding model helps to reduce the feature space and improves textual semantic as well as syntactic relations. This paper presents three embedding techniques (such as Word2Vec, GloVe, and FastText) with different hyperparameters implemented on a Bengali corpus consists of 180 million words. The performance of the embedding techniques is evaluated with extrinsic and intrinsic ways. Extrinsic performance evaluated by text classification, which achieved a maximum of 96.48{\%} accuracy. Intrinsic performance evaluated by word similarity (e.g., semantic, syntactic and relatedness) and analogy tasks. The maximum Pearson (r{\^{}}) correlation accuracy of 60.66{\%} (Ssr{\^{}}) achieved for semantic similarities and 71.64{\%} (Syr{\^{}}) for syntactic similarities whereas the relatedness obtained 79.80{\%} (Rsr{\^{}}). The semantic word analogy tasks achieved 44.00{\%} of accuracy while syntactic word analogy tasks obtained 36.00{\%}.",
}
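The abstract describes training three embedding models on a 180-million-word Bengali corpus. Below is a minimal sketch of such a setup, assuming gensim and toy data; the authors' actual toolchain and hyperparameters are not part of this record, and GloVe has no gensim trainer, so it would be built separately with the Stanford toolkit and loaded afterwards.

# Hypothetical training sketch; hyperparameters are illustrative only.
from gensim.models import Word2Vec, FastText

# Stand-in for the paper's 180M-word corpus: an iterable of tokenized sentences.
corpus = [["বাংলা", "ভাষা", "সমৃদ্ধ"], ["শব্দ", "ভেক্টর", "উপস্থাপনা"]]

# sg=1 selects the skip-gram objective; CBOW is sg=0.
w2v = Word2Vec(corpus, vector_size=300, window=5, min_count=1, sg=1, workers=4)
ft = FastText(corpus, vector_size=300, window=5, min_count=1, sg=1, workers=4)

print(w2v.wv.most_similar("ভাষা", topn=2))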
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hossain-hoque-2020-towards">
<titleInfo>
<title>Towards Bengali Word Embedding: Corpus Creation, Intrinsic and Extrinsic Evaluations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Md.</namePart>
<namePart type="given">Rajib</namePart>
<namePart type="family">Hossain</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammed</namePart>
<namePart type="given">Moshiul</namePart>
<namePart type="family">Hoque</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 17th International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Pushpak</namePart>
<namePart type="family">Bhattacharyya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dipti</namePart>
<namePart type="given">Misra</namePart>
<namePart type="family">Sharma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rajeev</namePart>
<namePart type="family">Sangal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>NLP Association of India (NLPAI)</publisher>
<place>
<placeTerm type="text">Indian Institute of Technology Patna, Patna, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>Distributional word vector representation, or word embedding, has become an essential ingredient in many natural language processing (NLP) tasks such as machine translation, document classification, information retrieval and question answering. Investigating the embedding model helps to reduce the feature space and improves textual semantic as well as syntactic relations. This paper presents three embedding techniques (Word2Vec, GloVe, and FastText) with different hyperparameters, implemented on a Bengali corpus consisting of 180 million words. The performance of the embedding techniques is evaluated in both extrinsic and intrinsic ways. Extrinsic performance is evaluated by text classification, which achieved a maximum of 96.48% accuracy. Intrinsic performance is evaluated by word similarity (semantic, syntactic and relatedness) and analogy tasks. The maximum Pearson correlation (r̂) achieved is 60.66% (Ssr̂) for semantic similarities and 71.64% (Syr̂) for syntactic similarities, whereas relatedness obtained 79.80% (Rsr̂). The semantic word analogy tasks achieved 44.00% accuracy, while the syntactic word analogy tasks obtained 36.00%.</abstract>
<identifier type="citekey">hossain-hoque-2020-towards</identifier>
<location>
<url>https://aclanthology.org/2020.icon-main.61</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>453</start>
<end>459</end>
</extent>
</part>
</mods>
</modsCollection>
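The record above reports intrinsic results as Pearson correlations (r̂) on similarity judgments and as word-analogy accuracy. The sketch below shows how such scores are typically computed against a gensim KeyedVectors model; the pair/question formats and the 3CosAdd analogy method are assumptions, not details taken from the paper.

from scipy.stats import pearsonr

def similarity_pearson(wv, pairs):
    """pairs: (word1, word2, human_score) triples from a similarity dataset."""
    model_scores, human_scores = [], []
    for w1, w2, gold in pairs:
        if w1 in wv and w2 in wv:  # skip out-of-vocabulary pairs
            model_scores.append(wv.similarity(w1, w2))
            human_scores.append(gold)
    r, _p = pearsonr(model_scores, human_scores)
    return r

def analogy_accuracy(wv, questions):
    """questions: (a, b, c, expected) tuples -- 'a is to b as c is to ?'."""
    correct = 0
    for a, b, c, expected in questions:
        # 3CosAdd: nearest neighbour of (b - a + c) by cosine similarity.
        predicted = wv.most_similar(positive=[b, c], negative=[a], topn=1)[0][0]
        correct += predicted == expected
    return correct / len(questions)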
%0 Conference Proceedings
%T Towards Bengali Word Embedding: Corpus Creation, Intrinsic and Extrinsic Evaluations
%A Hossain, Md. Rajib
%A Hoque, Mohammed Moshiul
%Y Bhattacharyya, Pushpak
%Y Sharma, Dipti Misra
%Y Sangal, Rajeev
%S Proceedings of the 17th International Conference on Natural Language Processing (ICON)
%D 2020
%8 December
%I NLP Association of India (NLPAI)
%C Indian Institute of Technology Patna, Patna, India
%F hossain-hoque-2020-towards
%X Distributional word vector representation, or word embedding, has become an essential ingredient in many natural language processing (NLP) tasks such as machine translation, document classification, information retrieval and question answering. Investigating the embedding model helps to reduce the feature space and improves textual semantic as well as syntactic relations. This paper presents three embedding techniques (Word2Vec, GloVe, and FastText) with different hyperparameters, implemented on a Bengali corpus consisting of 180 million words. The performance of the embedding techniques is evaluated in both extrinsic and intrinsic ways. Extrinsic performance is evaluated by text classification, which achieved a maximum of 96.48% accuracy. Intrinsic performance is evaluated by word similarity (semantic, syntactic and relatedness) and analogy tasks. The maximum Pearson correlation (r̂) achieved is 60.66% (Ssr̂) for semantic similarities and 71.64% (Syr̂) for syntactic similarities, whereas relatedness obtained 79.80% (Rsr̂). The semantic word analogy tasks achieved 44.00% accuracy, while the syntactic word analogy tasks obtained 36.00%.
%U https://aclanthology.org/2020.icon-main.61
%P 453-459
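The extrinsic result (up to 96.48% text-classification accuracy) can be approximated with mean-pooled document vectors and an off-the-shelf classifier. Everything in the sketch below is an assumption for illustration; the paper's actual classifier and features are not given in this record.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def doc_vector(wv, tokens):
    """Mean of the in-vocabulary word vectors; zero vector if none are known."""
    vecs = [wv[t] for t in tokens if t in wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(wv.vector_size)

def classification_accuracy(wv, train_docs, train_y, test_docs, test_y):
    X_train = np.stack([doc_vector(wv, d) for d in train_docs])
    X_test = np.stack([doc_vector(wv, d) for d in test_docs])
    clf = LogisticRegression(max_iter=1000).fit(X_train, train_y)
    return accuracy_score(test_y, clf.predict(X_test))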
Markdown (Informal)
[Towards Bengali Word Embedding: Corpus Creation, Intrinsic and Extrinsic Evaluations](https://aclanthology.org/2020.icon-main.61) (Hossain & Hoque, ICON 2020)
ACL
Md. Rajib Hossain and Mohammed Moshiul Hoque. 2020. Towards Bengali Word Embedding: Corpus Creation, Intrinsic and Extrinsic Evaluations. In Proceedings of the 17th International Conference on Natural Language Processing (ICON), pages 453–459, Indian Institute of Technology Patna, Patna, India. NLP Association of India (NLPAI).