{"id":995577,"date":"2024-01-05T08:07:40","date_gmt":"2024-01-05T16:07:40","guid":{"rendered":"https:\/\/www.microsoft.com\/en-us\/research\/?post_type=msr-project&#038;p=995577"},"modified":"2025-08-08T10:48:04","modified_gmt":"2025-08-08T17:48:04","slug":"afmr-scientific-discovery-and-innovation","status":"publish","type":"msr-project","link":"https:\/\/www.microsoft.com\/en-us\/research\/project\/afmr-scientific-discovery-and-innovation\/","title":{"rendered":"AFMR: Scientific Discovery and Innovation"},"content":{"rendered":"<section class=\"mb-3 moray-highlight\">\n\t<div class=\"card-img-overlay mx-lg-0\">\n\t\t<div class=\"card-background  has-background- card-background--full-bleed\">\n\t\t\t<img loading=\"lazy\" decoding=\"async\" width=\"1920\" height=\"720\" src=\"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Scientific-Discovery-and-Innovation-page-header_1920x720.png\" class=\"attachment-full size-full\" alt=\"white icon of a beaker with particles inside and a circled checkmark to the side on a green gradient background\" style=\"\" srcset=\"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Scientific-Discovery-and-Innovation-page-header_1920x720.png 1920w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Scientific-Discovery-and-Innovation-page-header_1920x720-300x113.png 300w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Scientific-Discovery-and-Innovation-page-header_1920x720-1024x384.png 1024w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Scientific-Discovery-and-Innovation-page-header_1920x720-768x288.png 768w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Scientific-Discovery-and-Innovation-page-header_1920x720-1536x576.png 1536w, 
https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Scientific-Discovery-and-Innovation-page-header_1920x720-1600x600.png 1600w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Scientific-Discovery-and-Innovation-page-header_1920x720-240x90.png 240w\" sizes=\"auto, (max-width: 1920px) 100vw, 1920px\" \/>\t\t<\/div>\n\t\t<!-- Foreground -->\n\t\t<div class=\"card-foreground d-flex mt-md-n5 my-lg-5 px-g px-lg-0\">\n\t\t\t<!-- Container -->\n\t\t\t<div class=\"container d-flex mt-md-n5 my-lg-5 \">\n\t\t\t\t<!-- Card wrapper -->\n\t\t\t\t<div class=\"w-100 w-lg-col-5\">\n\t\t\t\t\t<!-- Card -->\n\t\t\t\t\t<div class=\"card material-md-card py-5 px-md-5\">\n\t\t\t\t\t\t<div class=\"card-body \">\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<a href=\"https:\/\/www.microsoft.com\/en-us\/research\/collaboration\/accelerating-foundation-models-research\/\" class=\"icon-link icon-link--reverse mb-2\" data-bi-cN=\"Accelerating Foundation Models Research\">\n\t\t\t\t\t\t\t\t\t<span class=\"c-glyph glyph-chevron-left\" aria-hidden=\"true\"><\/span>\n\t\t\t\t\t\t\t\t\tAccelerating Foundation Models Research\t\t\t\t\t\t\t\t<\/a>\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n<h1 class=\"wp-block-heading\" id=\"scientific-discovery-and-innovation\">Scientific Discovery and Innovation<\/h1>\n\n\t\t\t\t\t\t<\/div>\n\t\t\t\t\t<\/div>\n\t\t\t\t<\/div>\n\t\t\t<\/div>\n\t\t<\/div>\n\t<\/div>\n<\/section>\n\n\n\n\n\n<blockquote class=\"wp-block-quote is-layout-flow wp-block-quote-is-layout-flow\">\n<p><strong><em>Academic research plays such an important role in advancing science, technology, culture, and society. 
This grant program helps ensure this community has access to the latest and leading AI models.<\/em><\/strong><\/p>\n<cite>Brad Smith, Vice Chair and President<\/cite><\/blockquote>\n\n\n\n<div class=\"wp-block-columns is-layout-flex wp-container-core-columns-is-layout-9d6595d7 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:25%\"><\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:50%\">\n<figure class=\"wp-block-image aligncenter size-full is-resized\"><img loading=\"lazy\" decoding=\"async\" width=\"400\" height=\"400\" src=\"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Discovery-Natural-Sciences_1.3.png\" alt=\"dark green icon of a lightbulb with a plant growing inside and a ring around the lightbulb\" class=\"wp-image-996369\" style=\"width:auto;height:150px\" srcset=\"https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Discovery-Natural-Sciences_1.3.png 400w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Discovery-Natural-Sciences_1.3-300x300.png 300w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Discovery-Natural-Sciences_1.3-150x150.png 150w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Discovery-Natural-Sciences_1.3-180x180.png 180w, https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2024\/01\/Discovery-Natural-Sciences_1.3-360x360.png 360w\" sizes=\"auto, (max-width: 400px) 100vw, 400px\" \/><\/figure>\n\n\n\n<h2 class=\"wp-block-heading has-text-align-center h4\" id=\"afmr-goal-accelerate-scientific-discovery-in-natural-sciences\">AFMR Goal: Accelerate scientific discovery in natural sciences<\/h2>\n\n\n\n<p class=\"has-text-align-center\">via proactive knowledge discovery, hypothesis generation, and multiscale multimodal data 
generation<\/p>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:25%\"><\/div>\n<\/div>\n\n\n\n<div style=\"padding-bottom:0; padding-top:0\" class=\"wp-block-msr-immersive-section alignfull row wp-block-msr-immersive-section\">\n\t\n\t<div class=\"container\">\n\t\t<div class=\"wp-block-msr-immersive-section__wrapper col-lg-11 col-xl-9 px-0 m-auto\">\n\t\t\t<div style=\"height:30px\" aria-hidden=\"true\" class=\"wp-block-spacer\"><\/div>\t\t<\/div>\n\t<\/div>\n\n\t<\/div>\n\n\n\n<p>These projects focus on using foundation models to enhance knowledge discovery and hypothesis generation across many different areas. They particularly leverage the ability of general models to make sense of the exponentially growing volume of scientific literature in astronomy, materials science, and neuroscience. These efforts include exploring domain-specific prompt engineering and specializing foundation models through fine-tuning using techniques such as Low-Rank Adaption (LoRA). A series of proposals are dedicated to biomedical and life sciences research and innovation, including specialized models for drug discovery, genomics, protein engineering, and rare diseases. These proposals underscore the potential of foundation models to accelerate scientific discovery and innovation across many fields and disciplines.<\/p>\n\n\n\n<div style=\"height:30px\" aria-hidden=\"true\" class=\"wp-block-spacer\"><\/div>\n\n\n\n\n\n<p><strong>University of Texas at Arlington<\/strong>: Miao Yin (PI)<\/p>\n\n\n\n<p>Ion chromatography (IC) is a powerful analytical chemistry technique for selective, sensitive quantification of aqueous ions spanning applications from environmental monitoring to biopharma pipelines. However, intrinsic slow analysis times severely throttle sample throughput. 
This project intends to develop an artificial intelligence-based platform accelerating IC by leveraging immense datasets from vast historical runs coupled with large foundation models tailored to effectively encode complex interactive influences of system parameters spanning columns, eluents, and detectors on separation performance into predictive modeling engines on Microsoft Azure. Additionally, a special tuning algorithm with analytical chemistry specialists&#8217; feedback will be developed to ensure the correct prediction of the large foundation IC model. Broader anticipated impacts are poised to revolutionize ion chromatography practices with AI across academic, manufacturing, and innovation areas while providing students at MSI with interdisciplinary research opportunities incorporating computer science and analytical chemistry.<\/p>\n\n\n\n\n\n<p><strong>Georgia Institute of Technology<\/strong>: Yunan Luo (PI)<\/p>\n\n\n\n<p>This proposal aims to leverage foundation models, including large language models trained on natural language and protein sequences, to advance protein function prediction and optimization. 
Two key areas of focus are 1) protein function prediction &#8211; predicting the biological roles of natural proteins and 2) protein function optimization &#8211; predicting which sequence mutations are beneficial for enhancing the function of natural proteins.<\/p>\n\n\n\n<p><strong>Related papers:<\/strong><\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/www.nature.com\/articles\/s42256-023-00751-0\" target=\"_blank\" rel=\"noopener noreferrer\">Calibrated geometric deep learning improves kinase\u2013drug binding predictions<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n\n\n\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/journals.plos.org\/ploscompbiol\/article?id=10.1371\/journal.pcbi.1012135\" target=\"_blank\" rel=\"noopener noreferrer\">Leveraging conformal prediction to annotate enzyme function space with limited false positives<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n\n\n\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/www.nature.com\/articles\/s41467-024-50698-y\" target=\"_blank\" rel=\"noopener noreferrer\">Machine learning-guided co-optimization of fitness and diversity facilitates combinatorial library design in enzyme engineering<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n\n\n\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" rel=\"noopener noreferrer\" target=\"_blank\" href=\"https:\/\/www.cell.com\/iscience\/fulltext\/S2589-0042(25)00379-7\">Pareto-optimal sampling for multi-objective protein sequence design<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n<\/ul>\n\n\n\n\n\n<p><strong>University College London<\/strong>: Bradley Love (PI)<\/p>\n\n\n\n<p>The project intends to utilize large 
language models (LLMs) to aid in the accumulation and assimilation of vast scientific literatures, especially in the field of neuroscience. The proposal aims to create BrainGPT, an AI tool for navigating and understanding large pools of data. The model will generate data patterns based on the scientific literature, assist in identifying anomalous findings, and offer insights for novel study designs. Additionally, the team intends to open source the models and training data for scientific scrutiny and improvements, fostering participation from the scientific community.<\/p>\n\n\n\n<p><strong>Related papers:<\/strong><\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/arxiv.org\/abs\/2411.11061\" target=\"_blank\" rel=\"noopener noreferrer\">Beyond Human-Like Processing: Large Language Models Perform Equivalently on Forward and Backward Scientific Text<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n\n\n\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/www.nature.com\/articles\/s41562-024-02046-9\" target=\"_blank\" rel=\"noopener noreferrer\">Large language models surpass human experts in predicting neuroscience results<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n\n\n\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/arxiv.org\/abs\/2505.08739\" target=\"_blank\" rel=\"noopener noreferrer\">Probability Consistency in Large Language Models: Theoretical Foundations Meet Empirical Discrepancies<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n<\/ul>\n\n\n\n\n\n<p><strong>New Mexico State University<\/strong>: Huiping Cao (PI)<\/p>\n\n\n\n<p>Data-driven machine learning (ML) models built on large amounts of data have gained great success in many applications. 
However, their success is less observed in scientific domains. Scientific discoveries and hypothesis generation largely depend on knowledge (commonsense knowledge and expert-domain knowledge). Most of such knowledge is scattered in different sources and such knowledge is rarely utilized in data-driven ML models. Developing ML models that can take both data and knowledge as input in the learning process is still in its infancy.<\/p>\n\n\n\n<p>Many scientific domains collect multi-modality data. However, there are no good benchmark multi-modal datasets to evaluate foundation models.<\/p>\n\n\n\n<p>This project will design and develop novel neural network models to extract domain knowledge, incorporate domain knowledge and account for multi-modality data in the learning framework to improve learning accuracy and efficiency. The proposed methods will be applied to one scientific domain, animal sciences, to validate their usefulness, and generate a knowledge base and a multi-modality dataset as a benchmark dataset.<\/p>\n\n\n\n\n\n<p><strong>University of California, Los Angeles<\/strong>: Aditya Grover (PI)<\/p>\n\n\n\n<p>The project proposes to develop a few-shot machine learning model to learn and optimize multi-task deep learning surrogates across various scientific and engineering domains. The plan includes unsupervised pretraining on large unlabelled datasets, followed by fine-tuning and evaluation on multiple disciplines, including bioengineering, material science, and mechanical design.<\/p>\n\n\n\n\n\n<p><strong>Harvard University<\/strong>: Alyssa Goodman (PI)<\/p>\n\n\n\n<p>We aim to enhance human interaction with astronomy literature by utilizing the capabilities of the Large Language Models, particularly GPT-4. We employ in-context prompting techniques to expose the model to astronomy papers to build an astronomy-focused chat application to engage the broader community. 
On the research track, we want to explore the potential foundation models have to generate novel scientific hypotheses. Specifically, we use GPT-4 to construct an instruction set of scientific ideas to fine-tune smaller models on this astronomy-specific downstream task. To assess their output\u2019s accuracy, feasibility and creativity, we employ a hybrid evaluation strategy consisting of human experts and judge GPT-4 instances. Our research will illuminate a novel and unique way of applying LLMs in the scientific arena.<\/p>\n\n\n\n<p><strong>Related papers:<\/strong><\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/arxiv.org\/abs\/2309.06126\" target=\"_blank\" rel=\"noopener noreferrer\">AstroLLaMA: Towards Specialized Foundation Models in Astronomy<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n\n\n\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/arxiv.org\/abs\/2401.01916\" target=\"_blank\" rel=\"noopener noreferrer\">AstroLLaMA-Chat: Scaling AstroLLaMA with Conversational and Diverse Datasets<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n\n\n\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/arxiv.org\/abs\/2409.19750\" target=\"_blank\" rel=\"noopener noreferrer\">AstroMLab 2: AstroLLaMA-2-70B Model and Benchmarking Specialised LLMs for Astronomy<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n<\/ul>\n\n\n\n\n\n<p><strong>University of Toronto Scarborough<\/strong>: Oleksandr Voznyy (PI)<\/p>\n\n\n\n<p>The proposal aims to establish Large Language Model (LLM) agents for inorganic materials discovery by augmenting GPT-3.5 with external tools and databases. 
The team will develop new text representations for the 3D structures of inorganic materials in order to enable discovery of materials for applications like catalysts, batteries, and photovoltaics.<\/p>\n\n\n\n\n\n<p><strong>Imperial College London<\/strong>: Aaron Zhao (PI)<\/p>\n\n\n\n<p>The proposal aims to enhance the understanding of complex genomic data by developing a novel Machine Learning framework. Through the use of a new mathematical formulation termed &#8216;hybrid graphs&#8217;, it is suggested that gene expression prediction can be improved beyond the capabilities of current sequence-based approaches. The proposal is also set to construct new databases and theoretical frameworks geared towards genomic data, addressing a current gap in the field.<\/p>\n\n\n\n<p><strong>Related papers:<\/strong><\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/arxiv.org\/abs\/2410.21345\" target=\"_blank\" rel=\"noopener noreferrer\">Absorb & Escape: Overcoming Single Model Limitations in Generating Genomic Sequences<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n\n\n\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/arxiv.org\/abs\/2310.06150\" target=\"_blank\" rel=\"noopener noreferrer\">Latent Diffusion Model for DNA Sequence Generation<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n<\/ul>\n\n\n\n\n\n<p><strong>University of Washington<\/strong>: Georg Seelig (PI)<\/p>\n\n\n\n<p>This proposal aims to develop a protein document dataset based on ontology and interaction annotations that will be used for developing protein language models (pLMs) capable of handling multi-protein inputs without linker strings. To take advantage of a structured protein training set, loss and training techniques inspired by natural language models such as RoBERTa will be used. 
To evaluate if the dataset and training approach generate more informative embeddings, we will evaluate using embeddings for tasks like functional prediction and low-N protein modeling.<\/p>\n\n\n\n\n\n<p><strong>University of New South Wales<\/strong>: Imran Razzak (PI) <\/p>\n\n\n\n<p> This research leverages Foundation Models to generate structured knowledge from materials science literature. Goals include enhancement of pre-existing datasets, making data in material science literature more discoverable, interoperable, and reusable, and simplifying the data mining workflow in materials science. The approach includes dataset management and construction, information extraction and inference, and knowledge discovery. <\/p>\n\n\n\n<p><strong>Related paper:<\/strong><\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/neurips.cc\/virtual\/2024\/poster\/95920\" target=\"_blank\" rel=\"noopener noreferrer\">Construction and Application of Materials Knowledge Graph in Multidisciplinary Materials Science via Large Language Model<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n<\/ul>\n\n\n\n\n\n<p><strong>Yale University<\/strong>: Arman Cohan (PI)<\/p>\n\n\n\n<p>The proposal focuses on making connections within scholarly documents using AI to accelerate scientific discovery. It aims to develop NLP systems that can generate reliable and trustworthy long-form summaries in response to user queries. 
The ultimate goal is to make it easier for users to comprehend vast amounts of scientific literature and foster faster scientific exploration.<\/p>\n\n\n\n<p><strong>Related papers:<\/strong><\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/arxiv.org\/abs\/2311.09805\" target=\"_blank\" rel=\"noopener noreferrer\">DocMath-Eval: Evaluating Math Reasoning Capabilities of LLMs in Understanding Long and Specialized Documents<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n\n\n\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/arxiv.org\/html\/2311.09797v2\" target=\"_blank\" rel=\"noopener noreferrer\">FinanceMath: Knowledge-Intensive Math Reasoning in Finance Domains<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n\n\n\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/arxiv.org\/abs\/2311.09721v1\" target=\"_blank\" rel=\"noopener noreferrer\">On Evaluating the Integration of Reasoning and Action in LLM Agents with Database Question Answering<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n\n\n\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/aclanthology.org\/2024.acl-long.692.pdf\" target=\"_blank\" rel=\"noopener noreferrer\">TAPERA: Enhancing Faithfulness and Interpretability in Long-Form Table QA by Content Planning and Execution-based Reasoning<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n<\/ul>\n\n\n\n\n\n<p><strong>Carnegie Mellon University<\/strong>: Larry Pileggi (PI)<\/p>\n\n\n\n<p>The proposal presents a new approach to Situation Awareness based on a Physics-ML synergy approach for which both the physical and ML models are embedded throughout the process to augment each other. 
This synergy framework enables fast, accurate, and end-to-end situation awareness that integrates system identification, anomaly detection and root cause diagnosis capabilities. The approach incorporates state of the art ML into the operation pipeline of real systems toward advanced operational efficiency, security, and reliable automatic control decision-making.<\/p>\n\n\n\n\n\n<p><strong>University of California, San Francisco<\/strong>: Tanja Kortemme (PI) <\/p>\n\n\n\n<p>This proposal aims to train a foundation model, Frame2seq, for protein sequence design. Frame2seq is a structure-conditioned masked language model with state-of-the-art accuracy and speed.&nbsp;Frame2seq will accelerate design&nbsp;of new functional proteins&nbsp;by robustly sampling sequence space unexplored in nature. This research has broad applications in material science, biotechnology, synthetic&nbsp;biology, and medicine.<\/p>\n\n\n\n\n\n<p><strong>University of Illinois Urbana-Champaign<\/strong>: Haohan Wang (PI)<\/p>\n\n\n\n<p>This proposal aims to develop a Team of AI-made Scientists (TAIS) that could dissect complex research questions, pull knowledge from a vast array of academic literature and databases, and employ quantitative and qualitative analysis to uncover deeper insights. 
The project tackles two main points: statistical trustworthiness (developing mathematical principles and parameter-efficient learning frameworks for large models) and collaborative trustworthiness (formulating an interactive paradigm for large models to work together).<\/p>\n\n\n\n<p><strong>Related papers:<\/strong><\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/arxiv.org\/abs\/2406.15341\" target=\"_blank\" rel=\"noopener noreferrer\">GenoTEX: A Benchmark for Evaluating LLM-Based Exploration of Gene Expression Data in Alignment with Bioinformaticians<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n\n\n\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/arxiv.org\/abs\/2402.03299\" target=\"_blank\" rel=\"noopener noreferrer\">GUARD: Role-playing to Generate Natural-language Jailbreakings to Test Guideline Adherence of Large Language Models<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n\n\n\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/arxiv.org\/abs\/2405.20413\" target=\"_blank\" rel=\"noopener noreferrer\">Jailbreaking Large Language Models Against Moderation Guardrails via Cipher Characters<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n\n\n\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/arxiv.org\/abs\/2402.12391\" target=\"_blank\" rel=\"noopener noreferrer\">Toward a Team of AI-made Scientists for Scientific Discovery from Gene Expression Data<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n<\/ul>\n\n\n\n\n\n<p><strong>The Ohio State University<\/strong>: Yuan-Sen Ting (PI)<\/p>\n\n\n\n<p>The proposal aims to adapt Large Language Models (LLMs) to address complex research queries within the 
field of astronomy, where current general-purpose LLMs often fall short. The team proposes to develop Foundation models adapted for astronomical research, using over 300,000 LaTeX papers and employing GPT-4-generated instructions for precision fine-tuning. The resulting model will be used for conversational question-and-answering (QA) and hypothesis-generation tasks.<\/p>\n\n\n\n<p><strong>Related papers:<\/strong><\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/arxiv.org\/abs\/2401.01916\" target=\"_blank\" rel=\"noopener noreferrer\">AstroLLaMA-Chat: Scaling AstroLLaMA with Conversational and Diverse Datasets<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n\n\n\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/arxiv.org\/abs\/2409.19750\" target=\"_blank\" rel=\"noopener noreferrer\">AstroMLab 2: AstroLLaMA-2-70B Model and Benchmarking Specialised LLMs for Astronomy<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n\n\n\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/arxiv.org\/abs\/2411.09012\" target=\"_blank\" rel=\"noopener noreferrer\">AstroMLab 3: Achieving GPT-4o Level Performance in Astronomy with a Specialized 8B-Parameter Large Language Model<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n\n\n\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/arxiv.org\/abs\/2409.14807\" target=\"_blank\" rel=\"noopener noreferrer\">Interpreting Multi-band Galaxy Observations with Large Language Model-Based Agents<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n\n\n\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/arxiv.org\/abs\/2406.01391\" 
target=\"_blank\" rel=\"noopener noreferrer\">Knowledge Graph in Astronomical Research with Large Language Models: Quantifying Driving Forces in Interdisciplinary Scientific Discovery<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n\n\n\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/arxiv.org\/abs\/2408.01556\" target=\"_blank\" rel=\"noopener noreferrer\">pathfinder: A Semantic Framework for Literature Review and Knowledge Discovery in Astronomy<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n<\/ul>\n\n\n\n\n\n<p><strong>University of New South Wales<\/strong>: Bram Hoex (PI)<\/p>\n\n\n\n<p>The proposal emphasizes using unsupervised word embeddings for predicting functional materials through a comprehensive assessment of various embedding methodologies and foundational models. The research framework uses language models for scientific discovery and analysis of the latent knowledge in publications.<\/p>\n\n\n\n<p><strong>Related paper:<\/strong><\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><a class=\"msr-external-link glyph-append glyph-append-open-in-new-tab glyph-append-xsmall\" href=\"https:\/\/www.cell.com\/patterns\/fulltext\/S2666-3899(24)00054-0\" target=\"_blank\" rel=\"noopener noreferrer\">Creation of a structured solar cell material dataset and performance prediction using large language models<span class=\"sr-only\"> (opens in new tab)<\/span><\/a><\/li>\n<\/ul>\n\n\n\n\n\n<p><strong>Universit\u00e9 de Montr\u00e9al<\/strong>: Glen Berseth (PI)<\/p>\n\n\n\n<p>The proposed research project aims to explore how large language models (LLMs) can assist in reducing the search space over molecular design. The researchers plan to formulate the molecular search problem as a sequence-generation problem, and develop an approach that leverages text-based RL to enhance molecular discovery efforts. 
Proposed methods include improving the objectives of research with more grounded metrics for evaluation and enhancing generalization by curating and fine-tuning datasets from related design problems.<\/p>\n\n\n\n\n\n<p><strong>University of Washington<\/strong>: Sheng Wang (PI)<\/p>\n\n\n\n<p>We propose to develop GPT-BLIAM, a model that utilizes GPT models to generate sentence descriptions for diseases, proteins, and their interactions, to enable the prediction of protein-disease associations. Our team will evaluate the model using existing protein-disease association databases and incorporate domain knowledge from Human Phenotype Ontology to learn prompts for rare diseases. Our goal is to improve the quality of the protein and disease embeddings and develop a machine learning model that can predict unknown protein-disease associations.<\/p>\n\n\n\n\n\n<div style=\"height:25px\" aria-hidden=\"true\" class=\"wp-block-spacer\"><\/div>\n\n\n","protected":false},"excerpt":{"rendered":"<p>Academic research plays such an important role in advancing science, technology, culture, and society. This grant program helps ensure this community has access to the latest and leading AI models. 
via proactive knowledge discovery, hypothesis generation, and multiscale multimodal data generation These projects focus on using foundation models to enhance knowledge discovery and hypothesis generation [&hellip;]<\/p>\n","protected":false},"featured_media":995910,"template":"","meta":{"msr-url-field":"","msr-podcast-episode":"","msrModifiedDate":"","msrModifiedDateEnabled":false,"ep_exclude_from_search":false,"_classifai_error":"","footnotes":""},"research-area":[13556],"msr-locale":[268875],"msr-impact-theme":[],"msr-pillar":[],"class_list":["post-995577","msr-project","type-msr-project","status-publish","has-post-thumbnail","hentry","msr-research-area-artificial-intelligence","msr-locale-en_us","msr-archive-status-active"],"msr_project_start":"","related-publications":[],"related-downloads":[],"related-videos":[],"related-groups":[],"related-events":[],"related-opportunities":[],"related-posts":[],"related-articles":[],"tab-content":[],"slides":[],"related-researchers":[],"msr_research_lab":[],"msr_impact_theme":[],"_links":{"self":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-project\/995577","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-project"}],"about":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/types\/msr-project"}],"version-history":[{"count":29,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-project\/995577\/revisions"}],"predecessor-version":[{"id":1147608,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-project\/995577\/revisions\/1147608"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/media\/995910"}],"wp:attachment":[{"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/media?parent=995577"}],"wp:term":[{"taxonomy":"msr-research-area","embeddable":true,"href":"https:\/\/www.micros
oft.com\/en-us\/research\/wp-json\/wp\/v2\/research-area?post=995577"},{"taxonomy":"msr-locale","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-locale?post=995577"},{"taxonomy":"msr-impact-theme","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-impact-theme?post=995577"},{"taxonomy":"msr-pillar","embeddable":true,"href":"https:\/\/www.microsoft.com\/en-us\/research\/wp-json\/wp\/v2\/msr-pillar?post=995577"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}