{"id":1043658,"date":"2024-06-05T10:28:58","date_gmt":"2024-06-05T17:28:58","guid":{"rendered":"https:\/\/www.microsoft.com\/en-us\/research\/?post_type=msr-research-item&#038;p=1043658"},"modified":"2024-06-19T17:32:47","modified_gmt":"2024-06-20T00:32:47","slug":"dejavu-kv-cache-streaming-for-fast-fault-tolerant-generative-llm-serving","status":"publish","type":"msr-research-item","link":"https:\/\/www.microsoft.com\/en-us\/research\/publication\/dejavu-kv-cache-streaming-for-fast-fault-tolerant-generative-llm-serving\/","title":{"rendered":"D\u00e9j\u00e0Vu: KV-cache Streaming for Fast, Fault-tolerant Generative LLM Serving"},"content":{"rendered":"<p>Distributed LLM serving is costly and often underutilizes hardware accelerators due to three key challenges: bubbles in pipeline-parallel deployments caused by the bimodal latency of prompt and token processing, GPU memory overprovisioning, and long recovery times in case of failures. In this paper, we propose D\u00e9j\u00e0Vu, a system to address all these challenges using a versatile and efficient KV cache streaming library (D\u00e9j\u00e0VuLib). Using D\u00e9j\u00e0VuLib, we propose and implement efficient prompt-token disaggregation to reduce pipeline bubbles, microbatch swapping for efficient GPU memory management, and state replication for fault-tolerance. We highlight the efficacy of these solutions on a range of large models across cloud deployments.<\/p>\n","protected":false},"excerpt":{"rendered":"<p>Distributed LLM serving is costly and often underutilizes hardware accelerators due to three key challenges: bubbles in pipeline-parallel deployments caused by the bimodal latency of prompt and token processing, GPU memory overprovisioning, and long recovery times in case of failures. In this paper, we propose D\u00e9j\u00e0Vu, a system to address all these challenges using a [&hellip;]<\/p>\n","protected":false},"featured_media":0,"template":"","meta":{"msr-url-field":"","msr-podcast-episode":"","msrModifiedDate":"","msrModifiedDateEnabled":false,"ep_exclude_from_search":false,"_classifai_error":"","msr-author-ordering":null,"msr_publishername":"","msr_publisher_other":"","msr_booktitle":"","msr_chapter":"","msr_edition":"","msr_editors":"","msr_how_published":"","msr_isbn":"","msr_issue":"","msr_journal":"","msr_number":"","msr_organization":"","msr_pages_string":"","msr_page_range_start":"","msr_page_range_end":"","msr_series":"","msr_volume":"","msr_copyright":"","msr_conference_name":"ICML 2024","msr_doi":"","msr_arxiv_id":"","msr_s2_paper_id":"","msr_mag_id":"","msr_pubmed_id":"","msr_other_authors":"","msr_other_contributors":"","msr_speaker":"","msr_award":"","msr_affiliation":"","msr_institution":"","msr_host":"","msr_version":"","msr_duration":"","msr_original_fields_of_study":"","msr_release_tracker_id":"","msr_s2_match_type":"","msr_citation_count_updated":"","msr_published_date":"2024-7-22","msr_highlight_text":"","msr_notes":"","msr_longbiography":"","msr_publicationurl":"","msr_external_url":"","msr_secondary_video_url":"","msr_conference_url":"https:\/\/icml.cc\/","msr_journal_url":"","msr_s2_pdf_url":"","msr_year":0,"msr_citation_count":0,"msr_influential_citations":0,"msr_reference_count":0,"msr_s2_match_confidence":0,"msr_microsoftintellectualproperty":true,"msr_s2_open_access":false,"msr_s2_author_ids":[],"msr_pub_ids":[],"msr_hide_image_in_river":0,"footnotes":""},"msr-research-highlight":[],"research-area":[13556,13547],"msr-publication-type":[193716],"msr-publisher":[],"msr-focus-area":[],"msr-locale":[268875],"msr-post-option":[],"msr-field-of-study":[268089],"msr-conference":[260284],"msr-journal":[],"msr-impact-theme":[],"msr-pillar":[],"class_list":["post-1043658","msr-research-item","type-msr-research-item","status-publish","hentry","msr-research-area-artificial-intelligence","msr-research-area-systems-and-networking","msr-locale-en_us","msr-field-of-study-large-language-models"],"msr_publishername":"","msr_edition":"","msr_affiliation":"","msr_published_date":"2024-7-22","msr_host":"","msr_duration":"","msr_version":"","msr_speaker":"","msr_other_contributors":"","msr_booktitle":"","msr_pages_string":"","msr_chapter":"","msr_isbn":"","msr_journal":"","msr_volume":"","msr_number":"","msr_editors":"","msr_series":"","msr_issue":"","msr_organization":"","msr_how_published":"","msr_notes":"","msr_highlight_text":"","msr_release_tracker_id":"","msr_original_fields_of_study":"","msr_download_urls":"","msr_external_url":"","msr_secondary_video_url":"","msr_longbiography":"","msr_microsoftintellectualproperty":1,"msr_main_download":"","msr_publicationurl":"","msr_doi":"","msr_publication_uploader":[{"type":"doi","viewUrl":"false","id":"false","title":"https:\/\/doi.org\/10.48550\/arXiv.2403.01876","label_id":"243106","label":0},{"type":"url","viewUrl":"false","id":"false","title":"https:\/\/arxiv.org\/pdf\/2403.01876.pdf","label_id":"243132","label":0},{"type":"url","viewUrl":"false","id":"false","title":"https:\/\/dblp.org\/rec\/journals\/corr\/abs-2403-01876.html","label_id":"243109","label":0},{"type":"file","viewUrl":"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/06\/7085_d_j_vu_kv_cache_streaming_for_.pdf","id":"1048716","title":"7085_d_j_vu_kv_cache_streaming_for_","label_id":"243132","label":0}],"msr_related_uploader":"","msr_citation_count":0,"msr_citation_count_updated":"","msr_s2_paper_id":"","msr_influential_citations":0,"msr_reference_count":0,"msr_arxiv_id":"","msr_s2_author_ids":[],"msr_s2_open_access":false,"msr_s2_pdf_url":null,"msr_attachments":[{"id":1048716,"url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/06\/7085_d_j_vu_kv_cache_streaming_for_.pdf"}],"msr-author-ordering":[{"type":"text","value":"Foteini Strati","user_id":0,"rest_url":false},{"type":"text","value":"Sara Mcallister","user_id":0,"rest_url":false},{"type":"text","value":"Amar Phanishayee","user_id":0,"rest_url":false},{"type":"user_nicename","value":"Jakub Tarnawski","user_id":38820,"rest_url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=Jakub Tarnawski"},{"type":"text","value":"Ana Klimovic","user_id":0,"rest_url":false}],"msr_impact_theme":[],"msr_research_lab":[199565],"msr_event":[1038558],"msr_group":[144927,437022],"msr_project":[472845],"publication":[],"video":[],"msr-tool":[],"msr_publication_type":"inproceedings","related_content":{"projects":[{"ID":472845,"post_title":"Project Fiddle","post_name":"fiddle","post_type":"msr-project","post_date":"2018-03-20 10:20:05","post_modified":"2025-02-06 11:06:39","post_status":"publish","permalink":"https:\/\/www.microsoft.com\/en-us\/research\/project\/fiddle\/","post_excerpt":"The goal of Project Fiddle is to build efficient systems infrastructure for very fast distributed DNN training. Our goal is to support 100x more efficient training. Our innovations cut across the systems stack: the memory subsystem, structuring parallel computation across GPUs and machines, and interconnects between GPUs and across machines.","_links":{"self":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-project\/472845"}]}}]},"_links":{"self":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/1043658","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item"}],"about":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/types\/msr-research-item"}],"version-history":[{"count":1,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/1043658\/revisions"}],"predecessor-version":[{"id":1043679,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/1043658\/revisions\/1043679"}],"wp:attachment":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/media?parent=1043658"}],"wp:term":[{"taxonomy":"msr-research-highlight","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-highlight?post=1043658"},{"taxonomy":"msr-research-area","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/research-area?post=1043658"},{"taxonomy":"msr-publication-type","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-publication-type?post=1043658"},{"taxonomy":"msr-publisher","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-publisher?post=1043658"},{"taxonomy":"msr-focus-area","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-focus-area?post=1043658"},{"taxonomy":"msr-locale","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-locale?post=1043658"},{"taxonomy":"msr-post-option","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-post-option?post=1043658"},{"taxonomy":"msr-field-of-study","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-field-of-study?post=1043658"},{"taxonomy":"msr-conference","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-conference?post=1043658"},{"taxonomy":"msr-journal","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-journal?post=1043658"},{"taxonomy":"msr-impact-theme","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-impact-theme?post=1043658"},{"taxonomy":"msr-pillar","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-pillar?post=1043658"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}