@inproceedings{abulaish-gulia-2022-compact,
title = "Compact Residual Learning with Frequency-Based Non-Square Kernels for Small Footprint Keyword Spotting",
author = "Abulaish, Muhammad and
Gulia, Rahul",
editor = "Akhtar, Md. Shad and
Chakraborty, Tanmoy",
booktitle = "Proceedings of the 19th International Conference on Natural Language Processing (ICON)",
month = dec,
year = "2022",
address = "New Delhi, India",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.icon-main.39",
pages = "328--336",
abstract = "Enabling voice assistants on small embedded devices requires a keyword spotter with a smaller model size and adequate accuracy. It becomes difficult to achieve a reasonable trade-off between a small footprint and high accuracy. Recent studies have demonstrated that convolution neural networks are also effective in the audio domain. In this paper, taking into account the nature of spectrograms, we propose a compact ResNet architecture that uses frequency-based non-square kernels to extract the maximum number of timbral features for keyword spotting. The proposed architecture is approximately three-and-a-half times smaller than a comparable architecture with conventional square kernels. On the Google{'}s speech command dataset v1, it outperforms both Google{'}s convolution neural networks and the equivalent ResNet architecture with square kernels. By implementing non-square kernels for spectrogram-related data, we can achieve a significant increase in accuracy with relatively few parameters, as compared to the conventional square kernels that are the default choice for every problem.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="abulaish-gulia-2022-compact">
<titleInfo>
<title>Compact Residual Learning with Frequency-Based Non-Square Kernels for Small Footprint Keyword Spotting</title>
</titleInfo>
<name type="personal">
<namePart type="given">Muhammad</namePart>
<namePart type="family">Abulaish</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rahul</namePart>
<namePart type="family">Gulia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th International Conference on Natural Language Processing (ICON)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Md.</namePart>
<namePart type="given">Shad</namePart>
<namePart type="family">Akhtar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">New Delhi, India</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Enabling voice assistants on small embedded devices requires a keyword spotter with a smaller model size and adequate accuracy. It becomes difficult to achieve a reasonable trade-off between a small footprint and high accuracy. Recent studies have demonstrated that convolution neural networks are also effective in the audio domain. In this paper, taking into account the nature of spectrograms, we propose a compact ResNet architecture that uses frequency-based non-square kernels to extract the maximum number of timbral features for keyword spotting. The proposed architecture is approximately three-and-a-half times smaller than a comparable architecture with conventional square kernels. On the Google’s speech command dataset v1, it outperforms both Google’s convolution neural networks and the equivalent ResNet architecture with square kernels. By implementing non-square kernels for spectrogram-related data, we can achieve a significant increase in accuracy with relatively few parameters, as compared to the conventional square kernels that are the default choice for every problem.</abstract>
<identifier type="citekey">abulaish-gulia-2022-compact</identifier>
<location>
<url>https://aclanthology.org/2022.icon-main.39</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>328</start>
<end>336</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Compact Residual Learning with Frequency-Based Non-Square Kernels for Small Footprint Keyword Spotting
%A Abulaish, Muhammad
%A Gulia, Rahul
%Y Akhtar, Md. Shad
%Y Chakraborty, Tanmoy
%S Proceedings of the 19th International Conference on Natural Language Processing (ICON)
%D 2022
%8 December
%I Association for Computational Linguistics
%C New Delhi, India
%F abulaish-gulia-2022-compact
%X Enabling voice assistants on small embedded devices requires a keyword spotter with a smaller model size and adequate accuracy. It becomes difficult to achieve a reasonable trade-off between a small footprint and high accuracy. Recent studies have demonstrated that convolution neural networks are also effective in the audio domain. In this paper, taking into account the nature of spectrograms, we propose a compact ResNet architecture that uses frequency-based non-square kernels to extract the maximum number of timbral features for keyword spotting. The proposed architecture is approximately three-and-a-half times smaller than a comparable architecture with conventional square kernels. On the Google’s speech command dataset v1, it outperforms both Google’s convolution neural networks and the equivalent ResNet architecture with square kernels. By implementing non-square kernels for spectrogram-related data, we can achieve a significant increase in accuracy with relatively few parameters, as compared to the conventional square kernels that are the default choice for every problem.
%U https://aclanthology.org/2022.icon-main.39
%P 328-336
Markdown (Informal)
[Compact Residual Learning with Frequency-Based Non-Square Kernels for Small Footprint Keyword Spotting](https://aclanthology.org/2022.icon-main.39) (Abulaish & Gulia, ICON 2022)
ACL