@inproceedings{081efbad05a5446c9c2875880b5d3431,
title = "sLLM: Accelerating LLM Inference using Semantic Load Balancing with Shared Memory Data Structures",
abstract = "As Large Language Models (LLMs) are increasingly deployed to support a broad spectrum of applications, enhancing inference efficiency and minimizing costs have become critical areas of focus. To address these challenges, researchers have explored optimizing the Key-Value (KV) cache within LLMs. However, existing approaches have not considered the potential benefits of sharing KV caches across multiple requests in a cluster environment. Addressing this gap, we introduce sLLM, a novel system that integrates an efficient shared-memory-based Semantic Load Balancer with a KV cache sharing mechanism. This design significantly reduces the need for recomputation during LLM inference, which enhances inference performance. Our evaluation of the sLLM system showcases its effectiveness: the Semantic Load Balancer achieves up to a 7× reduction in latency when dispatching requests, while the system as a whole can decrease the Time-To-First-Token (TTFT) for LLM inferences by 30 - 58%.",
keywords = "component, formatting, insert, style, styling",
author = "Jieyu Lin and Zhang, {Sai Qian} and Alberto Leon-Garcia",
note = "Publisher Copyright: {\textcopyright} 2024 IEEE.; 25th International Symposium on Quality Electronic Design, ISQED 2024 ; Conference date: 03-04-2024 Through 05-04-2024",
year = "2024",
doi = "10.1109/ISQED60706.2024.10528703",
language = "English (US)",
series = "Proceedings - International Symposium on Quality Electronic Design, ISQED",
publisher = "IEEE Computer Society",
booktitle = "Proceedings of the 25th International Symposium on Quality Electronic Design, ISQED 2024",
}