@inproceedings{7a5c4b9fea5e4ca3a45d6498839fef92,
title = "Hardware-aware Network Compression for Hybrid Vision Transformer via Low-Rank Approximation",
abstract = "In recent years, transformer-based models have achieved excellent performance in various fields such as computer vision and language processing. Specifically, vision transformer (ViT) models outperform conventional convolutional neural networks (CNNs) in image classification tasks by achieving higher accuracy. However, ViT-based models often require more parameters than CNNs, making efficient deployment challenging in memory-constrained environments such as mobile devices. For example, the peak memory required in the output header layer of EfficientViT was 39.32 Mb. Deploying such a layer on the Zynq-7000 XC7Z045 FPGA board requires off-chip memory access, leading to inefficient power consumption. To address these issues, we applied a low-rank approximation method to reduce the memory requirements of the EfficientViT-B1 model. Our proposed method using the EfficientViT-B1 model on the ImageNet dataset achieved performance with only a 0.43\% accuracy drop without requiring DRAM access.",
keywords = "Computer Vision, Deep Learning, FPGA, Network Compression, Vision Transformer",
author = "Kang, \{Beom Jin\} and Kim, \{Nam Joon\} and Hyun Kim",
note = "Publisher Copyright: {\textcopyright} 2024 IEEE.; 21st International System-on-Chip Design Conference, ISOCC 2024 ; Conference date: 19-08-2024 Through 22-08-2024",
year = "2024",
doi = "10.1109/ISOCC62682.2024.10762117",
language = "English",
series = "Proceedings - International SoC Design Conference 2024, ISOCC 2024",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
pages = "171--172",
booktitle = "Proceedings - International SoC Design Conference 2024, ISOCC 2024",
}
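
The abstract above describes compressing the EfficientViT-B1 output head with low-rank approximation. As a rough illustrative sketch only (not the authors' implementation; the layer sizes and rank below are made-up examples), a head weight matrix can be factorized with a truncated SVD so that one large fully connected layer is replaced by two smaller ones:

# Hypothetical sketch of low-rank approximation for a classification head.
# Factor the head weight W (out x in) into A (out x rank) and B (rank x in),
# with A @ B ~= W, so fewer parameters need to stay on-chip.
import numpy as np

def low_rank_factorize(W: np.ndarray, rank: int):
    """Return factors A (out x rank) and B (rank x in) with A @ B ~= W."""
    U, S, Vt = np.linalg.svd(W, full_matrices=False)
    A = U[:, :rank] * S[:rank]   # fold singular values into the left factor
    B = Vt[:rank, :]
    return A, B

# Example sizes are illustrative, not taken from the paper: a 1000-class head
# over a 2304-dimensional feature, compressed to rank 64.
rng = np.random.default_rng(0)
W = rng.standard_normal((1000, 2304)).astype(np.float32)
A, B = low_rank_factorize(W, rank=64)

orig_params = W.size
lr_params = A.size + B.size          # (1000 + 2304) * 64 parameters
print(f"parameters: {orig_params} -> {lr_params} "
      f"({lr_params / orig_params:.1%} of original)")

At inference time the single multiply x @ W.T becomes two smaller multiplies (x @ B.T) @ A.T, which is what reduces the weight memory footprint; how this maps onto the FPGA's on-chip memory is specific to the paper's design and not shown here.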