{"id":986898,"date":"2023-11-27T09:58:14","date_gmt":"2023-11-27T17:58:14","guid":{"rendered":"https:\/\/www.microsoft.com\/en-us\/research\/?post_type=msr-research-item&#038;p=986898"},"modified":"2024-04-26T14:02:12","modified_gmt":"2024-04-26T21:02:12","slug":"maira-1-a-specialised-large-multimodal-model-for-radiology-report-generation","status":"publish","type":"msr-research-item","link":"https:\/\/www.microsoft.com\/en-us\/research\/publication\/maira-1-a-specialised-large-multimodal-model-for-radiology-report-generation\/","title":{"rendered":"MAIRA-1: A specialised large multimodal model for radiology report generation"},"content":{"rendered":"<p>We present a radiology-specific multimodal model for the task for generating radiological reports from chest X-rays (CXRs). Our work builds on the idea that large language model(s) can be equipped with multimodal capabilities through alignment with pre-trained vision encoders. On natural images, this has been shown to allow multimodal models to gain image understanding and description capabilities. Our proposed model (MAIRA-1) leverages a CXR-specific image encoder in conjunction with a fine-tuned large language model based on Vicuna-7B, and text-based data augmentation, to produce reports with state-of-the-art quality. In particular, MAIRA-1 significantly improves on the radiologist-aligned RadCliQ metric and across all lexical metrics considered. Manual review of model outputs demonstrates promising fluency and accuracy of generated reports while uncovering failure modes not captured by existing evaluation practices. More information and resources can be found on the project website:\u00a0<a class=\"link-external link-https\" href=\"https:\/\/aka.ms\/maira\" rel=\"external noopener nofollow\">this https URL<\/a>.<\/p>\n","protected":false},"excerpt":{"rendered":"<p>We present a radiology-specific multimodal model for the task for generating radiological reports from chest X-rays (CXRs). Our work builds on the idea that large language model(s) can be equipped with multimodal capabilities through alignment with pre-trained vision encoders. On natural images, this has been shown to allow multimodal models to gain image understanding and [&hellip;]<\/p>\n","protected":false},"featured_media":0,"template":"","meta":{"msr-url-field":"","msr-podcast-episode":"","msrModifiedDate":"","msrModifiedDateEnabled":false,"ep_exclude_from_search":false,"_classifai_error":"","msr-author-ordering":null,"msr_publishername":"","msr_publisher_other":"","msr_booktitle":"","msr_chapter":"","msr_edition":"","msr_editors":"","msr_how_published":"","msr_isbn":"","msr_issue":"","msr_journal":"","msr_number":"MSR-TR-2023-47","msr_organization":"Microsoft","msr_pages_string":"","msr_page_range_start":"","msr_page_range_end":"","msr_series":"","msr_volume":"","msr_copyright":"","msr_conference_name":"","msr_doi":"","msr_arxiv_id":"","msr_s2_paper_id":"","msr_mag_id":"","msr_pubmed_id":"","msr_other_authors":"","msr_other_contributors":"","msr_speaker":"","msr_award":"","msr_affiliation":"","msr_institution":"","msr_host":"","msr_version":"","msr_duration":"","msr_original_fields_of_study":"","msr_release_tracker_id":"","msr_s2_match_type":"","msr_citation_count_updated":"","msr_published_date":"2023-11-22","msr_highlight_text":"","msr_notes":"","msr_longbiography":"","msr_publicationurl":"","msr_external_url":"","msr_secondary_video_url":"","msr_conference_url":"","msr_journal_url":"","msr_s2_pdf_url":"","msr_year":0,"msr_citation_count":0,"msr_influential_citations":0,"msr_reference_count":0,"msr_s2_match_confidence":0,"msr_microsoftintellectualproperty":true,"msr_s2_open_access":false,"msr_s2_author_ids":[],"msr_pub_ids":[],"msr_hide_image_in_river":0,"footnotes":""},"msr-research-highlight":[],"research-area":[13556,13562],"msr-publication-type":[193718],"msr-publisher":[],"msr-focus-area":[],"msr-locale":[268875],"msr-post-option":[],"msr-field-of-study":[263203,263185],"msr-conference":[],"msr-journal":[],"msr-impact-theme":[261673],"msr-pillar":[],"class_list":["post-986898","msr-research-item","type-msr-research-item","status-publish","hentry","msr-research-area-artificial-intelligence","msr-research-area-computer-vision","msr-locale-en_us","msr-field-of-study-computation-and-language","msr-field-of-study-computer-vision-and-pattern-recognition"],"msr_publishername":"","msr_edition":"","msr_affiliation":"","msr_published_date":"2023-11-22","msr_host":"","msr_duration":"","msr_version":"","msr_speaker":"","msr_other_contributors":"","msr_booktitle":"","msr_pages_string":"","msr_chapter":"","msr_isbn":"","msr_journal":"","msr_volume":"","msr_number":"MSR-TR-2023-47","msr_editors":"","msr_series":"","msr_issue":"","msr_organization":"Microsoft","msr_how_published":"","msr_notes":"","msr_highlight_text":"","msr_release_tracker_id":"","msr_original_fields_of_study":"","msr_download_urls":"","msr_external_url":"","msr_secondary_video_url":"","msr_longbiography":"","msr_microsoftintellectualproperty":1,"msr_main_download":"","msr_publicationurl":"","msr_doi":"","msr_publication_uploader":[{"type":"url","viewUrl":"false","id":"false","title":"https:\/\/arxiv.org\/abs\/2311.13668","label_id":"243109","label":0}],"msr_related_uploader":"","msr_citation_count":0,"msr_citation_count_updated":"","msr_s2_paper_id":"","msr_influential_citations":0,"msr_reference_count":0,"msr_arxiv_id":"","msr_s2_author_ids":[],"msr_s2_open_access":false,"msr_s2_pdf_url":null,"msr_attachments":[],"msr-author-ordering":[{"type":"user_nicename","value":"Stephanie Hyland","user_id":38458,"rest_url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=Stephanie Hyland"},{"type":"user_nicename","value":"Shruthi Bannur","user_id":39213,"rest_url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=Shruthi Bannur"},{"type":"user_nicename","value":"Kenza Bouzid","user_id":43290,"rest_url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=Kenza Bouzid"},{"type":"user_nicename","value":"Daniel Coelho de Castro","user_id":39811,"rest_url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=Daniel Coelho de Castro"},{"type":"guest","value":"mercy-ranjit","user_id":981147,"rest_url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=mercy-ranjit"},{"type":"user_nicename","value":"Anton Schwaighofer","user_id":31059,"rest_url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=Anton Schwaighofer"},{"type":"user_nicename","value":"Fernando P\u00e9rez Garc\u00eda","user_id":41473,"rest_url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=Fernando P\u00e9rez Garc\u00eda"},{"type":"user_nicename","value":"Valentina Salvatelli","user_id":40612,"rest_url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=Valentina Salvatelli"},{"type":"text","value":"Shaury Srivastav","user_id":0,"rest_url":false},{"type":"user_nicename","value":"Anja Thieme","user_id":35948,"rest_url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=Anja Thieme"},{"type":"user_nicename","value":"Noel Codella","user_id":41635,"rest_url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=Noel Codella"},{"type":"user_nicename","value":"Matthew Lungren","user_id":42792,"rest_url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=Matthew Lungren"},{"type":"text","value":"Maria Teodora Wetscherek","user_id":0,"rest_url":false},{"type":"user_nicename","value":"Ozan Oktay","user_id":38706,"rest_url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=Ozan Oktay"},{"type":"user_nicename","value":"Javier Alvarez-Valle","user_id":32137,"rest_url":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/microsoft-research\/v1\/researchers?person=Javier Alvarez-Valle"}],"msr_impact_theme":["Health"],"msr_research_lab":[199562,849856],"msr_event":[],"msr_group":[780706,1143270],"msr_project":[978063],"publication":[],"video":[],"msr-tool":[],"msr_publication_type":"techreport","related_content":{"projects":[{"ID":978063,"post_title":"Project MAIRA","post_name":"project-maira","post_type":"msr-project","post_date":"2023-11-24 01:00:00","post_modified":"2026-02-03 08:28:34","post_status":"publish","permalink":"https:\/\/www.microsoft.com\/en-us\/research\/project\/project-maira\/","post_excerpt":"Multimodal AI for Radiology Applications Project MAIRA is a research project from Microsoft Health Futures that builds innovative, multimodal AI technology to assist radiologists in delivering effective patient care and to empower them in their work. The goal of the project is to leverage rich healthcare data \u2013 including medical domain knowledge, temporal sequences of medical images and corresponding radiology reports, and other clinical context information \u2013 as inputs to developing multimodal frontier models that&hellip;","_links":{"self":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-project\/978063"}]}}]},"_links":{"self":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/986898","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item"}],"about":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/types\/msr-research-item"}],"version-history":[{"count":2,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/986898\/revisions"}],"predecessor-version":[{"id":1029297,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-item\/986898\/revisions\/1029297"}],"wp:attachment":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/media?parent=986898"}],"wp:term":[{"taxonomy":"msr-research-highlight","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-research-highlight?post=986898"},{"taxonomy":"msr-research-area","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/research-area?post=986898"},{"taxonomy":"msr-publication-type","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-publication-type?post=986898"},{"taxonomy":"msr-publisher","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-publisher?post=986898"},{"taxonomy":"msr-focus-area","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-focus-area?post=986898"},{"taxonomy":"msr-locale","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-locale?post=986898"},{"taxonomy":"msr-post-option","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-post-option?post=986898"},{"taxonomy":"msr-field-of-study","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-field-of-study?post=986898"},{"taxonomy":"msr-conference","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-conference?post=986898"},{"taxonomy":"msr-journal","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-journal?post=986898"},{"taxonomy":"msr-impact-theme","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-impact-theme?post=986898"},{"taxonomy":"msr-pillar","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-pillar?post=986898"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}