BibTeX
@inproceedings{yuan-etal-2022-data,
title = "Data Augmentation for the Post-Stroke Speech Transcription ({PSST}) Challenge: Sometimes Less Is More",
author = "Yuan, Jiahong and
Cai, Xingyu and
Church, Kenneth",
editor = "Kokkinakis, Dimitrios and
Themistocleous, Charalambos K. and
Fors, Kristina Lundholm and
Tsanas, Athanasios and
Fraser, Kathleen C.",
booktitle = "Proceedings of the RaPID Workshop - Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric/developmental impairments - within the 13th Language Resources and Evaluation Conference",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.rapid-1.9",
pages = "71--79",
abstract = "We employ the method of fine-tuning wav2vec2.0 for recognition of phonemes in aphasic speech. Our effort focuses on data augmentation, by supplementing data from both in-domain and out-of-domain datasets for training. We found that although a modest amount of out-of-domain data may be helpful, the performance of the model degrades significantly when the amount of out-of-domain data is much larger than in-domain data. Our hypothesis is that fine-tuning wav2vec2.0 with a CTC loss not only learns bottom-up acoustic properties but also top-down constraints. Therefore, out-of-domain data augmentation is likely to degrade performance if there is a language model mismatch between {``}in{''} and {``}out{''} domains. For in-domain audio without ground truth labels, we found that it is beneficial to exclude samples with less confident pseudo labels. Our final model achieves 16.7{\%} PER (phoneme error rate) on the validation set, without using a language model for decoding. The result represents a relative error reduction of 14{\%} over the baseline model trained without data augmentation. Finally, we found that {``}canonicalized{''} phonemes are much easier to recognize than manually transcribed phonemes.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="yuan-etal-2022-data">
    <titleInfo>
      <title>Data Augmentation for the Post-Stroke Speech Transcription (PSST) Challenge: Sometimes Less Is More</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Jiahong</namePart>
      <namePart type="family">Yuan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xingyu</namePart>
      <namePart type="family">Cai</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kenneth</namePart>
      <namePart type="family">Church</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-06</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the RaPID Workshop - Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric/developmental impairments - within the 13th Language Resources and Evaluation Conference</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Dimitrios</namePart>
        <namePart type="family">Kokkinakis</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Charalambos</namePart>
        <namePart type="given">K</namePart>
        <namePart type="family">Themistocleous</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kristina</namePart>
        <namePart type="given">Lundholm</namePart>
        <namePart type="family">Fors</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Athanasios</namePart>
        <namePart type="family">Tsanas</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Kathleen</namePart>
        <namePart type="given">C</namePart>
        <namePart type="family">Fraser</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>European Language Resources Association</publisher>
        <place>
          <placeTerm type="text">Marseille, France</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We employ the method of fine-tuning wav2vec2.0 for recognition of phonemes in aphasic speech. Our effort focuses on data augmentation, by supplementing data from both in-domain and out-of-domain datasets for training. We found that although a modest amount of out-of-domain data may be helpful, the performance of the model degrades significantly when the amount of out-of-domain data is much larger than in-domain data. Our hypothesis is that fine-tuning wav2vec2.0 with a CTC loss not only learns bottom-up acoustic properties but also top-down constraints. Therefore, out-of-domain data augmentation is likely to degrade performance if there is a language model mismatch between “in” and “out” domains. For in-domain audio without ground truth labels, we found that it is beneficial to exclude samples with less confident pseudo labels. Our final model achieves 16.7% PER (phoneme error rate) on the validation set, without using a language model for decoding. The result represents a relative error reduction of 14% over the baseline model trained without data augmentation. Finally, we found that “canonicalized” phonemes are much easier to recognize than manually transcribed phonemes.</abstract>
    <identifier type="citekey">yuan-etal-2022-data</identifier>
    <location>
      <url>https://aclanthology.org/2022.rapid-1.9</url>
    </location>
    <part>
      <date>2022-06</date>
      <extent unit="page">
        <start>71</start>
        <end>79</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Data Augmentation for the Post-Stroke Speech Transcription (PSST) Challenge: Sometimes Less Is More
%A Yuan, Jiahong
%A Cai, Xingyu
%A Church, Kenneth
%Y Kokkinakis, Dimitrios
%Y Themistocleous, Charalambos K.
%Y Fors, Kristina Lundholm
%Y Tsanas, Athanasios
%Y Fraser, Kathleen C.
%S Proceedings of the RaPID Workshop - Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric/developmental impairments - within the 13th Language Resources and Evaluation Conference
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F yuan-etal-2022-data
%X We employ the method of fine-tuning wav2vec2.0 for recognition of phonemes in aphasic speech. Our effort focuses on data augmentation, by supplementing data from both in-domain and out-of-domain datasets for training. We found that although a modest amount of out-of-domain data may be helpful, the performance of the model degrades significantly when the amount of out-of-domain data is much larger than in-domain data. Our hypothesis is that fine-tuning wav2vec2.0 with a CTC loss not only learns bottom-up acoustic properties but also top-down constraints. Therefore, out-of-domain data augmentation is likely to degrade performance if there is a language model mismatch between “in” and “out” domains. For in-domain audio without ground truth labels, we found that it is beneficial to exclude samples with less confident pseudo labels. Our final model achieves 16.7% PER (phoneme error rate) on the validation set, without using a language model for decoding. The result represents a relative error reduction of 14% over the baseline model trained without data augmentation. Finally, we found that “canonicalized” phonemes are much easier to recognize than manually transcribed phonemes.
%U https://aclanthology.org/2022.rapid-1.9
%P 71-79
Markdown (Informal)
[Data Augmentation for the Post-Stroke Speech Transcription (PSST) Challenge: Sometimes Less Is More](https://aclanthology.org/2022.rapid-1.9) (Yuan et al., RaPID 2022)
ACL
Jiahong Yuan, Xingyu Cai, and Kenneth Church. 2022. Data Augmentation for the Post-Stroke Speech Transcription (PSST) Challenge: Sometimes Less Is More. In Proceedings of the RaPID Workshop - Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric/developmental impairments - within the 13th Language Resources and Evaluation Conference, pages 71–79, Marseille, France. European Language Resources Association.
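
For readers who want to see the shape of the approach the abstract describes, below is a minimal, hypothetical sketch using the Hugging Face transformers library: a wav2vec 2.0 CTC model decoded greedily (no language model), a simple confidence score of the kind one might use to filter pseudo-labeled samples, and a phoneme error rate (PER) helper. The checkpoint name, the dummy audio, the 0.9 threshold, and the helper function are illustrative assumptions, not the paper's actual implementation.

```python
# Sketch only (not the authors' code): wav2vec 2.0 + CTC, greedy decoding
# without a language model, a pseudo-label confidence score, and PER.
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Illustrative checkpoint; the paper's fine-tuned phoneme models are not public here.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

waveform = torch.zeros(16000)  # 1 s of dummy 16 kHz audio standing in for a PSST sample
inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    logits = model(inputs.input_values).logits  # (batch, frames, vocab)

# Greedy CTC decoding, i.e. no language model, matching the setting of the
# paper's reported 16.7% PER.
pred_ids = torch.argmax(logits, dim=-1)
hypothesis = processor.batch_decode(pred_ids)[0]

# One plausible confidence score for pseudo-label filtering: the mean of the
# per-frame maximum softmax probabilities. Low-confidence samples would be
# excluded from the augmented training set; the threshold is a made-up value.
confidence = torch.softmax(logits, dim=-1).max(dim=-1).values.mean().item()
keep_for_training = confidence >= 0.9

def phoneme_error_rate(ref, hyp):
    """PER = Levenshtein distance between phoneme sequences / reference length."""
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            sub = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + sub)
    return d[-1][-1] / len(ref)

# One deletion against a 3-phoneme reference -> PER of 1/3.
print(phoneme_error_rate("k ae t".split(), "k ae".split()))
```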