{"id":999708,"date":"2024-01-30T05:14:41","date_gmt":"2024-01-30T13:14:41","guid":{"rendered":"https:\/\/www.microsoft.com\/en-us\/research\/?post_type=msr-blog-post&#038;p=999708"},"modified":"2024-06-10T10:08:58","modified_gmt":"2024-06-10T17:08:58","slug":"improving-reasoning-in-language-models-with-laser-layer-selective-rank-reduction","status":"publish","type":"msr-blog-post","link":"https:\/\/www.microsoft.com\/en-us\/research\/articles\/improving-reasoning-in-language-models-with-laser-layer-selective-rank-reduction\/","title":{"rendered":"Improving Reasoning in Language Models with LASER: Layer-Selective Rank Reduction"},"content":{"rendered":"\n<p class=\"has-purple-color has-text-color has-link-color wp-elements-3225103e130a42d6793d9713f1122c2b\"><em>Presented by <a href=\"https:\/\/www.microsoft.com\/en-us\/research\/people\/dimisra\/\" target=\"_blank\" rel=\"noreferrer noopener\">Dipendra Misra<\/a> at <strong>Microsoft Research Forum, January 2024<\/strong><\/em><\/p>\n\n\n\n<div class=\"wp-block-media-text has-vertical-margin-none  has-vertical-padding-none  is-stacked-on-mobile has-white-background-color has-background\" style=\"grid-template-columns:25% auto\"><figure class=\"wp-block-media-text__media\"><img loading=\"lazy\" decoding=\"async\" width=\"360\" height=\"360\" src=\"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Dipendra-Misra_360x360.jpg\" alt=\"Dipendra Misra\" class=\"wp-image-1003242 size-full\" srcset=\"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Dipendra-Misra_360x360.jpg 360w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Dipendra-Misra_360x360-300x300.jpg 300w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Dipendra-Misra_360x360-150x150.jpg 150w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Dipendra-Misra_360x360-180x180.jpg 180w\" sizes=\"auto, (max-width: 360px) 100vw, 
360px\" \/><\/figure><div class=\"wp-block-media-text__content\">\n<blockquote class=\"wp-block-quote is-style-spectrum is-layout-flow wp-block-quote-is-layout-flow\">\n<p>\u201cAn LLM is trained on lots of data, often collected from the internet, and uses a model architecture, typically a transformer, to train the model, and they work remarkably well across a range of different tasks. And so one way perhaps we can build towards understanding [an] LLM is by performing interventions in the model and then seeing how that intervention reflects in [its performance].\u201d<\/p>\n<cite><em>\u2013<\/em> Dipendra Misra, Senior Researcher<\/cite><\/blockquote>\n<\/div><\/div>\n\n\n\n<figure class=\"wp-block-embed is-type-video is-provider-youtube wp-block-embed-youtube wp-embed-aspect-16-9 wp-has-aspect-ratio\"><div class=\"wp-block-embed__wrapper\">\n<iframe loading=\"lazy\" title=\"Improving Reasoning in Language Models with LASER: Layer-Selective Rank Reduction\" width=\"500\" height=\"281\" src=\"https:\/\/www.youtube-nocookie.com\/embed\/MGS6PiZhQgI?feature=oembed&rel=0\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share\" referrerpolicy=\"strict-origin-when-cross-origin\" allowfullscreen><\/iframe>\n<\/div><\/figure>\n\n\n\n<div class=\"annotations \" data-bi-aN=\"margin-callout\">\n\t<article class=\"annotations__list card depth-16 bg-body p-4 annotations__list--right\">\n\t\t<div class=\"annotations__list-item\">\n\t\t\t\t\t\t\t<a href=\"https:\/\/msrchat.azurewebsites.net\/?askmsr=Summarize%20the%20main%20three%20points%20of%20Dipendra%27s%20talk\" target=\"_blank\" aria-label=\"Summarize the main three points of Dipendra's talk\" data-bi-type=\"annotated-link\" data-bi-cN=\"Summarize the main three points of Dipendra's talk\" class=\"annotations__list-thumbnail\" >\n\t\t\t\t\t<img loading=\"lazy\" decoding=\"async\" width=\"172\" height=\"96\" 
src=\"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-240x135.png\" class=\"mb-2\" alt=\"Ask Microsoft research copilot experience\" srcset=\"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-240x135.png 240w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-300x169.png 300w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-1024x576.png 1024w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-768x432.png 768w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-1066x600.png 1066w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-655x368.png 655w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-343x193.png 343w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-640x360.png 640w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-960x540.png 960w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo-1280x720.png 1280w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/MSR-Chat-Promo.png 1400w\" sizes=\"auto, (max-width: 172px) 100vw, 172px\" \/>\t\t\t\t<\/a>\n\t\t\t\t\t\t\t<span class=\"annotations__type d-block text-uppercase font-weight-semibold text-neutral-300 small\">Microsoft research copilot experience<\/span>\n\t\t\t<a href=\"https:\/\/msrchat.azurewebsites.net\/?askmsr=Summarize%20the%20main%20three%20points%20of%20Dipendra%27s%20talk\" data-bi-cN=\"Summarize the main three points of Dipendra's talk\" target=\"_blank\" rel=\"noopener noreferrer\" data-external-link=\"true\" data-bi-aN=\"margin-callout\" data-bi-type=\"annotated-link\" class=\"annotations__link 
font-weight-semibold text-decoration-none\"><span>Summarize the main three points of Dipendra's talk<\/span>&nbsp;<span class=\"glyph-in-link glyph-append glyph-append-open-in-new-tab\" aria-hidden=\"true\"><\/span><\/a>\t\t\t\t\t<\/div>\n\t<\/article>\n<\/div>\n\n\n<div class=\"wp-block-msr-show-more\">\n\t<div class=\"bg-neutral-100 p-5\">\n\t\t<div class=\"show-more-show-less\">\n\t\t\t<div>\n\t\t\t\t<span>\n\t\t\t\t\t\n\n<h3 class=\"wp-block-heading\" id=\"transcript\">Transcript<\/h3>\n\n\n\n<p><strong>Dipendra Misra<\/strong>, Senior Researcher, Microsoft Research NYC and AI Frontiers<\/p>\n\n\n\n<p>Dipendra Misra will present a surprising discovery that by merely replacing selected weight matrices in an LLM with their suitable low-rank approximation, you can significantly improve the performance of the LLM, at times by 20 to 30 percentage points.<\/p>\n\n\n\n<p><em>Microsoft Research Forum, January 30, 2024<\/em>&nbsp;<\/p>\n\n\n\n<p><strong>DIPENDRA MISRA<\/strong><strong>:<\/strong> Welcome, everyone. I&#8217;m Dipendra Misra, a researcher at Microsoft Research New York City and AI Frontiers, and I&#8217;m excited to be talking about our new method called LASER, which is <em>Layer-Selective Rank<\/em> <em>Reduction<\/em>, an approach for improving pretrained large language models. So large language models, or LLMs, have revolutionized machine learning, and yet there is so little we know about how they work.&nbsp;<\/p>\n\n\n\n\t\t\t\t<\/span>\n\t\t\t\t<span id=\"show-more-show-less-toggle-1\" class=\"show-more-show-less-toggleable-content\">\n\t\t\t\t\t\n\n\n\n<p>So in a summary, an LLM is trained on lots of data, often collected from the internet, and uses a model architecture, typically a transformer, to train the model, and they work remarkably well across a range of different tasks. 
And so one way perhaps we can build towards the understanding of LLM is by performing intervention in the model and then seeing how that intervention reflects in the performance of the LLM. For example, we may find that performing a certain type of intervention may affect one type of task but not the other. And by this way, we may understand how the information about solving different tasks is stored inside the LLM. So with this motivation in mind, we introduce LASER, which is a type of intervention where we select one of the weight matrices of the LLM and replace it by its low-rank approximation.&nbsp;<\/p>\n\n\n\n<p>So in the bottom over here, we see our transformer architecture. If you&#8217;re not familiar with the details of it, that&#8217;s fine. What we need to know here is that the transformer architecture consists of repeated transformer blocks arranged in different layers, and each block has multiple weight matrices, which are shown here in square. So, for example, here, to perform LASER, we select this weight matrix, which is highlighted in red, and it&#8217;s coming from layer No. 22, and we call it the \\(W\\) matrix here.<\/p>\n\n\n\n<p>And to perform this low-rank approximation, we first use what&#8217;s called a <em>singular value decomposition<\/em>, which decomposes this matrices into three matrices called the \\(U\\), \\(\u03a3\\), and \\(V\\). The \\(\u03a3\\) here contains the singular value of the matrices, and it&#8217;s arranged diagonally in decreasing order. So to perform its lower-rank approximation, we throw away all the information in \\(U\\), \\(\u03a3\\), and \\(V\\), which is not in blue color, and then we multiply the remaining matrix, and we get its low-rank approximation, which is shown in \\(W_{lr}\\). And this is a very computationally efficient process and can be done easily with existing libraries.<\/p>\n\n\n\n<p>So in summary, to perform a single LASER intervention, one has to make three choices. 
So first is which layer to select. Second is which type of weight matrix to edit. And third is how much approximation should be done. In our paper, we also study how these different LASER interventions can be composed across layers and applied simultaneously. So before discussing how to evaluate LASER, I want to mention that LASER also has the advantage of reducing the memory footprint of the model. And this is important because we are living in this age where the memory taken by LLMs is growing at an astonishing pace, and by reducing the memory footprint, we can allow more people to be able to use these LLMs and store them on device.&nbsp;<\/p>\n\n\n\n<p>So for our first evaluation, we evaluate LASER on an existing GPT-J LLM and evaluate on the CounterFact question-answering dataset. The motivation for this is that the GPT-J LLM has its training data available publicly, which allows us to do interesting analysis with it, and the CounterFact question-answering dataset has paraphrases, which allows us to measure robustness to paraphrases.&nbsp;<\/p>\n\n\n\n<p>Now as I mentioned earlier, we are doing intervention using LASER on the LLM, so one would expect that the model loss should go up as we are doing more approximation, meaning that the model is going to perform bad, right, because we are throwing [out] information from an LLM, which is trained on large amounts of data. But to our surprise, what we find [is] that if the right type of LASER intervention is performed, then the model loss doesn&#8217;t go up but actually goes down, meaning that we actually improve the pretrained LLM even more.&nbsp;<\/p>\n\n\n\n<p>So in this figure here, we show what happens when the LASER is applied to the MLP matrices, and we see that if we apply LASER at the earlier layer, then the loss is going up. Here, the orange color or the yellow color shows that we&#8217;re doing less approximation, and black or in blue means we are doing more approximation. 
So in the lower layer, we can see that the yellow has a lower loss, but the black has a higher loss. But if you apply LASER in the later layers, we see that the loss is actually decreasing as we do more approximation. And this is truly surprising.&nbsp;&nbsp;<\/p>\n\n\n\n<p>So does this hold more generally? So we find that, yes, this does hold across several tasks and in three different LLMs, namely RoBERTa, GPT-J, and Llama 2. And at times, we see surprising gains like 20 to 30 percentage points. For example, on this task of gender prediction using biographies, we see that the performance of GPT-J goes from 70.9 percent to 97.5 percent accuracy. And in our paper, we have more type of analysis. I&#8217;ll just briefly describe two of them quickly.<\/p>\n\n\n\n<p>So one of them shows that if you apply LASER, then the most gains that we get are from improvements in data points which are rarer in the training data. And we also find that the components that the LASER removes from a weight [matrix] typically offer semantically correct but incorrect responses. And so we can view LASER as a denoising process which is removing this erroneous information.&nbsp;&nbsp;<\/p>\n\n\n\n<p>So in conclusion, we present LASER, which is a new way of doing intervention in large language models, and we show a surprising result that performing LASER can both increase the accuracy of these large language models while also [reducing] the memory footprint. 
And more details can be found in our paper, which is available on arXiv and will appear as a conference paper at the upcoming ICLR conference.<\/p>\n\n\n\n<p>Thank you.<\/p>\n\n\t\t\t\t<\/span>\n\t\t\t<\/div>\n\t\t\t<button\n\t\t\t\tclass=\"action-trigger glyph-prepend mt-2 mb-0 show-more-show-less-toggle\"\n\t\t\t\taria-expanded=\"false\"\n\t\t\t\tdata-show-less-text=\"Show less\"\n\t\t\t\ttype=\"button\"\n\t\t\t\taria-controls=\"show-more-show-less-toggle-1\"\n\t\t\t\taria-label=\"Show more content\"\n\t\t\t\tdata-alternate-aria-label=\"Show less content\">\n\t\t\t\tShow more\t\t\t<\/button>\n\t\t<\/div>\n\t<\/div>\n<\/div>\n\n\n\n<h3 class=\"wp-block-heading alignwide\" id=\"related-resources\">Related resources<\/h3>\n\n\n\n<div class=\"wp-block-columns alignwide are-vertically-aligned-top is-layout-flex wp-container-core-columns-is-layout-9d6595d7 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-vertically-aligned-top is-layout-flow wp-block-column-is-layout-flow\">\n<div class=\"annotations \" data-bi-aN=\"citation\">\n\t<article class=\"annotations__list card depth-16 bg-body p-4 \">\n\t\t<div class=\"annotations__list-item\">\n\t\t\t\t\t\t<span class=\"annotations__type d-block text-uppercase font-weight-semibold text-neutral-300 small\">Publication<\/span>\n\t\t\t<a href=\"https:\/\/www.microsoft.com\/en-us\/research\/publication\/the-truth-is-in-there-improving-reasoning-in-language-models-with-layer-selective-rank-reduction\/\" data-bi-cN=\"The Truth is in There: Improving Reasoning in Language Models with Layer-Selective Rank Reduction\" data-external-link=\"false\" data-bi-aN=\"citation\" data-bi-type=\"annotated-link\" class=\"annotations__link font-weight-semibold text-decoration-none\"><span>The Truth is in There: Improving Reasoning in Language Models with Layer-Selective Rank Reduction<\/span>&nbsp;<span class=\"glyph-in-link glyph-append glyph-append-chevron-right\" 
aria-hidden=\"true\"><\/span><\/a>\t\t\t\t\t<\/div>\n\t<\/article>\n<\/div>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-vertically-aligned-top is-layout-flow wp-block-column-is-layout-flow\">\n<div class=\"annotations \" data-bi-aN=\"citation\">\n\t<article class=\"annotations__list card depth-16 bg-body p-4 \">\n\t\t<div class=\"annotations__list-item\">\n\t\t\t\t\t\t<span class=\"annotations__type d-block text-uppercase font-weight-semibold text-neutral-300 small\">Podcast<\/span>\n\t\t\t<a href=\"https:\/\/www.microsoft.com\/en-us\/research\/podcast\/abstracts-january-25-2024\/\" data-bi-cN=\"Abstracts: January 25, 2024\" data-external-link=\"false\" data-bi-aN=\"citation\" data-bi-type=\"annotated-link\" class=\"annotations__link font-weight-semibold text-decoration-none\"><span>Abstracts: January 25, 2024<\/span>&nbsp;<span class=\"glyph-in-link glyph-append glyph-append-chevron-right\" aria-hidden=\"true\"><\/span><\/a>\t\t\t\t\t<\/div>\n\t<\/article>\n<\/div>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-vertically-aligned-top is-layout-flow wp-block-column-is-layout-flow\"><\/div>\n<\/div>\n","protected":false},"excerpt":{"rendered":"<p>Dipendra Misra, Senior Researcher at Microsoft Research New York City and AI Frontiers lightning talk presentation at the Microsoft Research 
Forum.<\/p>\n","protected":false},"author":42735,"featured_media":1002795,"template":"","meta":{"msr-url-field":"","msr-podcast-episode":"","msrModifiedDate":"","msrModifiedDateEnabled":false,"ep_exclude_from_search":false,"_classifai_error":"","msr-content-parent":999513,"msr_hide_image_in_river":0,"footnotes":""},"research-area":[],"msr-locale":[268875],"msr-post-option":[],"class_list":["post-999708","msr-blog-post","type-msr-blog-post","status-publish","has-post-thumbnail","hentry","msr-locale-en_us"],"msr_assoc_parent":{"id":999513,"type":"story"},"_links":{"self":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-blog-post\/999708","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-blog-post"}],"about":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/types\/msr-blog-post"}],"author":[{"embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/users\/42735"}],"version-history":[{"count":26,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-blog-post\/999708\/revisions"}],"predecessor-version":[{"id":1045116,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-blog-post\/999708\/revisions\/1045116"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/media\/1002795"}],"wp:attachment":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/media?parent=999708"}],"wp:term":[{"taxonomy":"msr-research-area","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/research-area?post=999708"},{"taxonomy":"msr-locale","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-locale?post=999708"},{"taxonomy":"msr-post-option","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-post-option?post=
999708"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}