@inproceedings{mohebbi-etal-2024-transformer,
    title = "Transformer-specific Interpretability",
    author = "Mohebbi, Hosein and
      Jumelet, Jaap and
      Hanna, Michael and
      Alishahi, Afra and
      Zuidema, Willem",
    editor = "Mesgar, Mohsen and
      Lo{\'a}iciga, Sharid",
    booktitle = "Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: Tutorial Abstracts",
    month = mar,
    year = "2024",
    address = "St. Julian{'}s, Malta",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.eacl-tutorials.4",
    pages = "21--26",
    abstract = "Transformers have emerged as dominant players in various scientific fields, especially NLP. However, their inner workings, like many other neural networks, remain opaque. Despite the widespread use of model-agnostic interpretability techniques, such as gradient-based and occlusion-based methods, their shortcomings are becoming increasingly apparent for Transformer interpretation, making the field of interpretability more demanding today. In this tutorial, we will present Transformer-specific interpretability methods, a new and trending approach that makes use of specific features of the Transformer architecture and is deemed more promising for understanding Transformer-based models. We start by discussing the potential pitfalls and misleading results that model-agnostic approaches may produce when interpreting Transformers. Next, we discuss Transformer-specific methods, including those designed to quantify context-mixing interactions among all input pairs (the fundamental property of the Transformer architecture) and those that combine causal methods with low-level Transformer analysis to identify particular subnetworks within a model that are responsible for specific tasks. By the end of the tutorial, we hope participants will understand the advantages (as well as current limitations) of Transformer-specific interpretability methods, along with how these can be applied to their own research.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="mohebbi-etal-2024-transformer">
    <titleInfo>
      <title>Transformer-specific Interpretability</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Hosein</namePart>
      <namePart type="family">Mohebbi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jaap</namePart>
      <namePart type="family">Jumelet</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Michael</namePart>
      <namePart type="family">Hanna</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Afra</namePart>
      <namePart type="family">Alishahi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Willem</namePart>
      <namePart type="family">Zuidema</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: Tutorial Abstracts</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Mohsen</namePart>
        <namePart type="family">Mesgar</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Sharid</namePart>
        <namePart type="family">Loáiciga</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">St. Julian’s, Malta</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Transformers have emerged as dominant players in various scientific fields, especially NLP. However, their inner workings, like many other neural networks, remain opaque. Despite the widespread use of model-agnostic interpretability techniques, such as gradient-based and occlusion-based methods, their shortcomings are becoming increasingly apparent for Transformer interpretation, making the field of interpretability more demanding today. In this tutorial, we will present Transformer-specific interpretability methods, a new and trending approach that makes use of specific features of the Transformer architecture and is deemed more promising for understanding Transformer-based models. We start by discussing the potential pitfalls and misleading results that model-agnostic approaches may produce when interpreting Transformers. Next, we discuss Transformer-specific methods, including those designed to quantify context-mixing interactions among all input pairs (the fundamental property of the Transformer architecture) and those that combine causal methods with low-level Transformer analysis to identify particular subnetworks within a model that are responsible for specific tasks. By the end of the tutorial, we hope participants will understand the advantages (as well as current limitations) of Transformer-specific interpretability methods, along with how these can be applied to their own research.</abstract>
    <identifier type="citekey">mohebbi-etal-2024-transformer</identifier>
    <location>
      <url>https://aclanthology.org/2024.eacl-tutorials.4</url>
    </location>
    <part>
      <date>2024-03</date>
      <extent unit="page">
        <start>21</start>
        <end>26</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Transformer-specific Interpretability
%A Mohebbi, Hosein
%A Jumelet, Jaap
%A Hanna, Michael
%A Alishahi, Afra
%A Zuidema, Willem
%Y Mesgar, Mohsen
%Y Loáiciga, Sharid
%S Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: Tutorial Abstracts
%D 2024
%8 March
%I Association for Computational Linguistics
%C St. Julian’s, Malta
%F mohebbi-etal-2024-transformer
%X Transformers have emerged as dominant players in various scientific fields, especially NLP. However, their inner workings, like many other neural networks, remain opaque. Despite the widespread use of model-agnostic interpretability techniques, such as gradient-based and occlusion-based methods, their shortcomings are becoming increasingly apparent for Transformer interpretation, making the field of interpretability more demanding today. In this tutorial, we will present Transformer-specific interpretability methods, a new and trending approach that makes use of specific features of the Transformer architecture and is deemed more promising for understanding Transformer-based models. We start by discussing the potential pitfalls and misleading results that model-agnostic approaches may produce when interpreting Transformers. Next, we discuss Transformer-specific methods, including those designed to quantify context-mixing interactions among all input pairs (the fundamental property of the Transformer architecture) and those that combine causal methods with low-level Transformer analysis to identify particular subnetworks within a model that are responsible for specific tasks. By the end of the tutorial, we hope participants will understand the advantages (as well as current limitations) of Transformer-specific interpretability methods, along with how these can be applied to their own research.
%U https://aclanthology.org/2024.eacl-tutorials.4
%P 21-26
Markdown (Informal)
[Transformer-specific Interpretability](https://aclanthology.org/2024.eacl-tutorials.4) (Mohebbi et al., EACL 2024)
ACL
Hosein Mohebbi, Jaap Jumelet, Michael Hanna, Afra Alishahi, and Willem Zuidema. 2024. Transformer-specific Interpretability. In Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: Tutorial Abstracts, pages 21–26, St. Julian’s, Malta. Association for Computational Linguistics.