Embedded page: "A Dynamic Benchmark for Image Understanding" (Microsoft Research, Neel Joshi)
https://www.microsoft.com/en-us/research/project/image-understanding-benchmark/

We have created a procedurally generatable, synthetic dataset for testing spatial reasoning, visual prompting, object recognition and detection. A key question for understanding multimodal model performance is how well it can understand images, in particular basic vs. detailed spatial understanding of images. These capabilities are needed for models to be used in real-world tasks, such […]
