{"id":1145339,"date":"2025-07-19T04:43:47","date_gmt":"2025-07-19T11:43:47","guid":{"rendered":"https:\/\/www.microsoft.com\/en-us\/research\/?post_type=msr-research-item&#038;p=1145339"},"modified":"2025-07-20T01:34:20","modified_gmt":"2025-07-20T08:34:20","slug":"large-language-models-can-provide-accurate-and-interpretable-incident-triage-2","status":"publish","type":"msr-research-item","link":"https:\/\/www.microsoft.com\/en-us\/research\/publication\/large-language-models-can-provide-accurate-and-interpretable-incident-triage-2\/","title":{"rendered":"Large Language Models Can Provide Accurate and Interpretable Incident Triage"},"content":{"rendered":"<p>Large-scale cloud services frequently experience incidents that can have a significant impact on their stability. Incident triage is a critical process that assigns incidents to dedicated teams for resolution. However, traditional rule-based methods, commonly employed in various systems, have limitations due to a finite set of rules that necessitate continuous updates, leading to suboptimal performance. Current state-of-the-art approaches primarily rely on textual information, utilizing classifiers or unsupervised clustering. Unfortunately, the abundance of textual information, combined with considerable noise, presents a significant challenge to the accuracy of these methods. To tackle these challenges, we introduce COMET, an innovative system that utilizes an AutoExtractor to filter out non-critical logs and employs a Large Language Model (LLM) for keyword extraction. This approach effectively mitigates the complexity arising from disordered textual information. Additionally, COMET incorporates significant domain knowledge during keyword extraction, enhancing the LLM\u2019s comprehension of the text. We deployed COMET on multiple cloud services within Microsoft, where it has operated continuously for over six months. Offline and online evaluations have shown that COMET achieves enhanced accuracy and reduced Time to Mitigation (TTM).<\/p>\n","protected":false},"excerpt":{"rendered":"<p>Large-scale cloud services frequently experience incidents that can have a significant impact on their stability. Incident triage is a critical process that assigns incidents to dedicated teams for resolution. However, traditional rule-based methods, commonly employed in various systems, have limitations due to a finite set of rules that necessitate continuous updates, leading to suboptimal performance. [&hellip;]<\/p>\n","protected":false},"featured_media":0,"template":"","meta":{"msr-url-field":"","msr-podcast-episode":"","msrModifiedDate":"","msrModifiedDateEnabled":false,"ep_exclude_from_search":false,"_classifai_error":"","msr-author-ordering":null,"msr_publishername":"","msr_publisher_other":"","msr_booktitle":"","msr_chapter":"","msr_edition":"","msr_editors":"","msr_how_published":"","msr_isbn":"","msr_issue":"","msr_journal":"","msr_number":"","msr_organization":"","msr_pages_string":"","msr_page_range_start":"523","msr_page_range_end":"534","msr_series":"","msr_volume":"","msr_copyright":"","msr_conference_name":"","msr_doi":"","msr_arxiv_id":"","msr_s2_paper_id":"","msr_mag_id":"","msr_pubmed_id":"","msr_other_authors":"","msr_other_contributors":"","msr_speaker":"","msr_award":"","msr_affiliation":"","msr_institution":"","msr_host":"","msr_version":"","msr_duration":"","msr_original_fields_of_study":null,"msr_release_tracker_id":"","msr_s2_match_type":"","msr_citation_count_updated":"","msr_published_date":"2024-10-28","msr_highlight_text":"","msr_notes":"","msr_longbiography":"","msr_publicationurl":"","msr_external_url":"","msr_secondary_video_url":"","msr_conference_url":"","msr_journal_url":"","msr_s2_pdf_url":"","msr_year":0,"msr_citation_count":0,"msr_influential_citations":0,"msr_reference_count":0,"msr_s2_match_confidence":0,"msr_microsoftintellectualproperty":true,"msr_s2_open_access":false,"msr_s2_author_ids":[],"msr_pub_ids":[],"msr_hide_image_in_river":null,"footnotes":""},"msr-research-highlight":[],"research-area":[13556,13560],"msr-publication-type":[193715],"msr-publisher":[],"msr-focus-area":[],"msr-locale":[268875],"msr-post-option":[269148,269142],"msr-field-of-study":[268041,246694,247555,251365],"msr-conference":[],"msr-journal":[270037],"msr-impact-theme":[],"msr-pillar":[],"class_list":["post-1145339","msr-research-item","type-msr-research-item","status-publish","hentry","msr-research-area-artificial-intelligence","msr-research-area-programming-languages-software-engineering","msr-locale-en_us","msr-post-option-approved-for-river","msr-post-option-include-in-river","msr-field-of-study-aiops","msr-field-of-study-artificial-intelligence","msr-field-of-study-cloud-computing","msr-field-of-study-software-engineering"],"msr_publishername":"","msr_edition":"","msr_affiliation":"","msr_published_date":"2024-10-28","msr_host":"","msr_duration":"","msr_version":"","msr_speaker":"","msr_other_contributors":"","msr_booktitle":"","msr_pages_string":"","msr_chapter":"","msr_isbn":"","msr_journal":"","msr_volume":"","msr_number":"","msr_editors":"","msr_series":"","msr_issue":"","msr_organization":"","msr_how_published":"","msr_notes":"","msr_highlight_text":"","msr_release_tracker_id":"","msr_original_fields_of_study":"","msr_download_urls":"","msr_external_url":"","msr_secondary_video_url":"","msr_longbiography":"","msr_microsoftintellectualproperty":1,"msr_main_download":"","msr_publicationurl":"","msr_doi":"","msr_publication_uploader":[{"type":"doi","viewUrl":"false","id":"false","title":"https:\/\/doi.org\/10.1109\/ISSRE62328.2024.00056","label_id":"243106","label":0},{"type":"url","viewUrl":"false","id":"false","title":"https:\/\/dblp.org\/rec\/conf\/issre\/WangLML0ZBCRL0P24.html","label_id":"243109","label":0}],"msr_related_uploader":"","msr_citation_count":0,"msr_citation_count_updated":"","msr_s2_paper_id":"","msr_influential_citations":0,"msr_reference_count":0,"msr_arxiv_id":"","msr_s2_author_ids":[],"msr_s2_open_access":false,"msr_s2_pdf_url":null,"msr_attachments":[],"msr-author-ordering":[{"type":"text","value":"Zexin Wang","user_id":0,"rest_url":false},{"type":"text","value":"Jianhui Li","user_id":0,"rest_url":false},{"type":"text","value":"Minghua Ma","user_id":0,"rest_url":false},{"type":"text","value":"Ze Li","user_id":0,"rest_url":false},{"type":"user_nicename","value":"Yu Kang","user_id":39381,"rest_url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=Yu Kang"},{"type":"user_nicename","value":"Chaoyun Zhang","user_id":42387,"rest_url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=Chaoyun Zhang"},{"type":"text","value":"Chetan Bansal","user_id":0,"rest_url":false},{"type":"text","value":"Murali Chintalapati","user_id":0,"rest_url":false},{"type":"text","value":"S. Rajmohan","user_id":0,"rest_url":false},{"type":"text","value":"Qingwei Lin","user_id":0,"rest_url":false},{"type":"text","value":"Dongmei Zhang","user_id":0,"rest_url":false},{"type":"text","value":"Changhua Pei","user_id":0,"rest_url":false},{"type":"text","value":"Gaogang Xie","user_id":0,"rest_url":false}],"msr_impact_theme":[],"msr_research_lab":[],"msr_event":[],"msr_group":[],"msr_project":[855579],"publication":[],"video":[],"msr-tool":[],"msr_publication_type":"article","related_content":{"projects":[{"ID":855579,"post_title":"AIOps","post_name":"aiops","post_type":"msr-project","post_date":"2022-06-24 04:09:36","post_modified":"2022-10-25 05:28:06","post_status":"publish","permalink":"https:\/\/www.microsoft.com\/en-us\/research\/project\/aiops\/","post_excerpt":"In the past fifteen years, the most significant paradigm shift in the computing industry is the migration to cloud computing, which brings unprecedented opportunities of digital transformation to business, society, and human life. The implication of this is profound. It means that cloud computing platforms have become part of the basic infrastructure of the world. Therefore, the non-functional properties of cloud computing platforms, including availability, reliability, performance, efficiency, security, sustainability, etc., become immensely important. The&hellip;","_links":{"self":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-project\/855579"}]}}]},"_links":{"self":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/1145339","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item"}],"about":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/types\/msr-research-item"}],"version-history":[{"count":1,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/1145339\/revisions"}],"predecessor-version":[{"id":1145340,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/1145339\/revisions\/1145340"}],"wp:attachment":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/media?parent=1145339"}],"wp:term":[{"taxonomy":"msr-research-highlight","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-highlight?post=1145339"},{"taxonomy":"msr-research-area","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/research-area?post=1145339"},{"taxonomy":"msr-publication-type","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-publication-type?post=1145339"},{"taxonomy":"msr-publisher","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-publisher?post=1145339"},{"taxonomy":"msr-focus-area","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-focus-area?post=1145339"},{"taxonomy":"msr-locale","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-locale?post=1145339"},{"taxonomy":"msr-post-option","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-post-option?post=1145339"},{"taxonomy":"msr-field-of-study","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-field-of-study?post=1145339"},{"taxonomy":"msr-conference","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-conference?post=1145339"},{"taxonomy":"msr-journal","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-journal?post=1145339"},{"taxonomy":"msr-impact-theme","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-impact-theme?post=1145339"},{"taxonomy":"msr-pillar","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-pillar?post=1145339"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}