@inproceedings{fields-kennington-2023-exploring,
title = "Exploring Transformers as Compact, Data-efficient Language Models",
author = "Fields, Clayton and
Kennington, Casey",
editor = "Jiang, Jing and
Reitter, David and
Deng, Shumin",
booktitle = "Proceedings of the 27th Conference on Computational Natural Language Learning (CoNLL)",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.conll-1.35",
doi = "10.18653/v1/2023.conll-1.35",
pages = "521--531",
abstract = "Large scale transformer models, trained with massive datasets have become the standard in natural language processing. The huge size of most transformers make research with these models impossible for those with limited computational resources. Additionally, the enormous pretraining data requirements of transformers exclude pretraining them with many smaller datasets that might provide enlightening results. In this study, we show that transformers can be significantly reduced in size, with as few as 5.7 million parameters, and still retain most of their downstream capability. Further we show that transformer models can retain comparable results when trained on human-scale datasets, as few as 5 million words of pretraining data. Overall, the results of our study suggest transformers function well as compact, data efficient language models and that complex model compression methods, such as model distillation are not necessarily superior to pretraining reduced size transformer models from scratch.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="fields-kennington-2023-exploring">
<titleInfo>
<title>Exploring Transformers as Compact, Data-efficient Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Clayton</namePart>
<namePart type="family">Fields</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Casey</namePart>
<namePart type="family">Kennington</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 27th Conference on Computational Natural Language Learning (CoNLL)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jing</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Reitter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shumin</namePart>
<namePart type="family">Deng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Large-scale transformer models, trained on massive datasets, have become the standard in natural language processing. The huge size of most transformers makes research with these models impossible for those with limited computational resources. Additionally, the enormous pretraining data requirements of transformers preclude pretraining them on many smaller datasets that might provide enlightening results. In this study, we show that transformers can be significantly reduced in size, to as few as 5.7 million parameters, and still retain most of their downstream capability. Further, we show that transformer models can achieve comparable results when trained on human-scale datasets with as few as 5 million words of pretraining data. Overall, the results of our study suggest that transformers function well as compact, data-efficient language models and that complex model compression methods, such as model distillation, are not necessarily superior to pretraining reduced-size transformer models from scratch.</abstract>
<identifier type="citekey">fields-kennington-2023-exploring</identifier>
<identifier type="doi">10.18653/v1/2023.conll-1.35</identifier>
<location>
<url>https://aclanthology.org/2023.conll-1.35</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>521</start>
<end>531</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Exploring Transformers as Compact, Data-efficient Language Models
%A Fields, Clayton
%A Kennington, Casey
%Y Jiang, Jing
%Y Reitter, David
%Y Deng, Shumin
%S Proceedings of the 27th Conference on Computational Natural Language Learning (CoNLL)
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F fields-kennington-2023-exploring
%X Large-scale transformer models, trained on massive datasets, have become the standard in natural language processing. The huge size of most transformers makes research with these models impossible for those with limited computational resources. Additionally, the enormous pretraining data requirements of transformers preclude pretraining them on many smaller datasets that might provide enlightening results. In this study, we show that transformers can be significantly reduced in size, to as few as 5.7 million parameters, and still retain most of their downstream capability. Further, we show that transformer models can achieve comparable results when trained on human-scale datasets with as few as 5 million words of pretraining data. Overall, the results of our study suggest that transformers function well as compact, data-efficient language models and that complex model compression methods, such as model distillation, are not necessarily superior to pretraining reduced-size transformer models from scratch.
%R 10.18653/v1/2023.conll-1.35
%U https://aclanthology.org/2023.conll-1.35
%U https://doi.org/10.18653/v1/2023.conll-1.35
%P 521-531
Markdown (Informal)
[Exploring Transformers as Compact, Data-efficient Language Models](https://aclanthology.org/2023.conll-1.35) (Fields & Kennington, CoNLL 2023)
ACL
Clayton Fields and Casey Kennington. 2023. Exploring Transformers as Compact, Data-efficient Language Models. In Proceedings of the 27th Conference on Computational Natural Language Learning (CoNLL), pages 521–531, Singapore. Association for Computational Linguistics.