@inproceedings{sterner-teufel-2023-tongueswitcher,
title = "{T}ongue{S}witcher: Fine-Grained Identification of {G}erman-{E}nglish Code-Switching",
author = "Sterner, Igor and
Teufel, Simone",
editor = "Winata, Genta and
Kar, Sudipta and
Zhukova, Marina and
Solorio, Thamar and
Diab, Mona and
Sitaram, Sunayana and
Choudhury, Monojit and
Bali, Kalika",
booktitle = "Proceedings of the 6th Workshop on Computational Approaches to Linguistic Code-Switching",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.calcs-1.1",
doi = "10.18653/v1/2023.calcs-1.1",
pages = "1--13",
abstract = "This paper contributes to German-English code-switching research. We provide the largest corpus of naturally occurring German-English code-switching, where English is included in German text, and two methods for code-switching identification. The first method is rule-based, using wordlists and morphological processing. We use this method to compile a corpus of 25.6M tweets employing German-English code-switching. In our second method, we continue pretraining of a neural language model on this corpus and classify tokens based on embeddings from this language model. Our systems establish SoTA on our new corpus and an existing German-English code-switching benchmark. In particular, we systematically study code-switching for language-ambiguous words which can only be resolved in context, and morphologically mixed words consisting of both English and German morphemes. We distribute both corpora and systems to the research community.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sterner-teufel-2023-tongueswitcher">
<titleInfo>
<title>TongueSwitcher: Fine-Grained Identification of German-English Code-Switching</title>
</titleInfo>
<name type="personal">
<namePart type="given">Igor</namePart>
<namePart type="family">Sterner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simone</namePart>
<namePart type="family">Teufel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 6th Workshop on Computational Approaches to Linguistic Code-Switching</title>
</titleInfo>
<name type="personal">
<namePart type="given">Genta</namePart>
<namePart type="family">Winata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sudipta</namePart>
<namePart type="family">Kar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marina</namePart>
<namePart type="family">Zhukova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thamar</namePart>
<namePart type="family">Solorio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mona</namePart>
<namePart type="family">Diab</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sunayana</namePart>
<namePart type="family">Sitaram</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Monojit</namePart>
<namePart type="family">Choudhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper contributes to German-English code-switching research. We provide the largest corpus of naturally occurring German-English code-switching, where English is included in German text, and two methods for code-switching identification. The first method is rule-based, using wordlists and morphological processing. We use this method to compile a corpus of 25.6M tweets employing German-English code-switching. In our second method, we continue pretraining of a neural language model on this corpus and classify tokens based on embeddings from this language model. Our systems establish SoTA on our new corpus and an existing German-English code-switching benchmark. In particular, we systematically study code-switching for language-ambiguous words which can only be resolved in context, and morphologically mixed words consisting of both English and German morphemes. We distribute both corpora and systems to the research community.</abstract>
<identifier type="citekey">sterner-teufel-2023-tongueswitcher</identifier>
<identifier type="doi">10.18653/v1/2023.calcs-1.1</identifier>
<location>
<url>https://aclanthology.org/2023.calcs-1.1</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>1</start>
<end>13</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TongueSwitcher: Fine-Grained Identification of German-English Code-Switching
%A Sterner, Igor
%A Teufel, Simone
%Y Winata, Genta
%Y Kar, Sudipta
%Y Zhukova, Marina
%Y Solorio, Thamar
%Y Diab, Mona
%Y Sitaram, Sunayana
%Y Choudhury, Monojit
%Y Bali, Kalika
%S Proceedings of the 6th Workshop on Computational Approaches to Linguistic Code-Switching
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F sterner-teufel-2023-tongueswitcher
%X This paper contributes to German-English code-switching research. We provide the largest corpus of naturally occurring German-English code-switching, where English is included in German text, and two methods for code-switching identification. The first method is rule-based, using wordlists and morphological processing. We use this method to compile a corpus of 25.6M tweets employing German-English code-switching. In our second method, we continue pretraining of a neural language model on this corpus and classify tokens based on embeddings from this language model. Our systems establish SoTA on our new corpus and an existing German-English code-switching benchmark. In particular, we systematically study code-switching for language-ambiguous words which can only be resolved in context, and morphologically mixed words consisting of both English and German morphemes. We distribute both corpora and systems to the research community.
%R 10.18653/v1/2023.calcs-1.1
%U https://aclanthology.org/2023.calcs-1.1
%U https://doi.org/10.18653/v1/2023.calcs-1.1
%P 1-13
Markdown (Informal)
[TongueSwitcher: Fine-Grained Identification of German-English Code-Switching](https://aclanthology.org/2023.calcs-1.1) (Sterner & Teufel, CALCS-WS 2023)
ACL