@inproceedings{2e44b9c2868d4e88a7e169523d96fb91,
  title     = {{TRE-Map}: Towards reducing the overheads of fault-aware retraining of deep neural networks by merging fault maps},
  abstract  = {Recently, fault-aware retraining has emerged as a promising approach to improve the error resilience of Deep Neural Networks (DNNs) against manufacturing-induced defects in DNN accelerators. However, state-of-the-art fault-aware training techniques incur a gigantic retraining overhead due to their per-chip retraining nature for the chip{\textquoteright}s unique fault map, which may render it practically infeasible if retraining is done on large datasets. To address this major limitation and improve the practicability of the fault-aware retraining methodology, this work proposes a novel concept of merging fault maps to effectively retrain a DNN for a group of faulty chips in a single fault-aware retraining round. The merging of fault maps enables to avoid per chip retraining and thereby reduces the retraining overhead significantly. However, the merging of fault maps brings in new challenges such as training divergence (accuracy collapse) if a high number of accumulated faults are injected into the network in the first epoch. To address these challenges, we propose a methodology for effective merging of fault maps and then retraining of DNNs. Experimental results show that our methodology offers at least 1.4x retraining speedup on average while improving the error resilience of the network (depending on the DNN models and the number of merged fault maps). For example, for the Resnet-32 model using fault map generated from 5 fault maps at the fault rate 6e-3, our methodology offers 2x retraining speedup and 0.6\% classification accuracy drop against per-chip retraining.},
  keywords  = {DNN accelerator, Deep neural networks, Fault maps, Manufacturing defects, Reliability, Resilience, SRAM},
  author    = {Hoang, {Le Ha} and Hanif, {Muhammad Abdullah} and Shafique, Muhammad},
  note      = {Funding Information: This work is partially supported by Intel Corporation through Gift funding for the project ``Cost-Effective Dependability for Deep Neural Networks and Spiking Neural Networks.'' Publisher Copyright: {\textcopyright} 2021 {IEEE}.; 24th Euromicro Conference on Digital System Design, {DSD} 2021 ; Conference date: 01-09-2021 Through 03-09-2021},
  year      = {2021},
  doi       = {10.1109/DSD53832.2021.00072},
  language  = {English (US)},
  series    = {Proceedings - 2021 24th Euromicro Conference on Digital System Design, {DSD} 2021},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  pages     = {434--441},
  editor    = {Francesco Leporati and Salvatore Vitabile and Amund Skavhaug},
  booktitle = {Proceedings - 2021 24th Euromicro Conference on Digital System Design, {DSD} 2021},
}