<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
  <channel>
    <title>Vision on David Lang</title>
    <link>https://www.davidlang.tech/tags/vision/</link>
    <description>Recent content in Vision on David Lang</description>
    <generator>Hugo</generator>
    <language>en</language>
    <lastBuildDate>Tue, 05 Nov 2024 00:00:00 +0000</lastBuildDate>
    <atom:link href="https://www.davidlang.tech/tags/vision/index.xml" rel="self" type="application/rss+xml" />
    <item>
      <title>Multi-Modal AI: Working with Images and Text</title>
      <link>https://www.davidlang.tech/posts/multi-modal-ai-working-with-images-and-text/</link>
      <pubDate>Tue, 05 Nov 2024 00:00:00 +0000</pubDate>
      <guid>https://www.davidlang.tech/posts/multi-modal-ai-working-with-images-and-text/</guid>
      <description>&lt;p&gt;Multi-modal models accept images and text in one request-enabling document OCR, UI screenshot analysis, and visual Q&amp;amp;A.&lt;/p&gt;&#xA;&lt;h2 id=&#34;vision-api-example&#34;&gt;Vision API Example&lt;/h2&gt;&#xA;&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; style=&#34;color:#93a1a1;background-color:#002b36;-moz-tab-size:4;-o-tab-size:4;tab-size:4;-webkit-text-size-adjust:none;&#34;&gt;&lt;code class=&#34;language-typescript&#34; data-lang=&#34;typescript&#34;&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;&lt;span style=&#34;color:#268bd2&#34;&gt;const&lt;/span&gt; response &lt;span style=&#34;color:#719e07&#34;&gt;=&lt;/span&gt; &lt;span style=&#34;color:#719e07&#34;&gt;await&lt;/span&gt; openai.chat.completions.create({&#xA;&lt;/span&gt;&lt;/span&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;  model&lt;span style=&#34;color:#719e07&#34;&gt;:&lt;/span&gt; &lt;span style=&#34;color:#2aa198&#34;&gt;&amp;#39;gpt-4o&amp;#39;&lt;/span&gt;,&#xA;&lt;/span&gt;&lt;/span&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;  messages&lt;span style=&#34;color:#719e07&#34;&gt;:&lt;/span&gt; [&#xA;&lt;/span&gt;&lt;/span&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;    {&#xA;&lt;/span&gt;&lt;/span&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;      role&lt;span style=&#34;color:#719e07&#34;&gt;:&lt;/span&gt; &lt;span style=&#34;color:#2aa198&#34;&gt;&amp;#39;user&amp;#39;&lt;/span&gt;,&#xA;&lt;/span&gt;&lt;/span&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;      content&lt;span style=&#34;color:#719e07&#34;&gt;:&lt;/span&gt; [&#xA;&lt;/span&gt;&lt;/span&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;        { &lt;span style=&#34;color:#268bd2&#34;&gt;type&lt;/span&gt;&lt;span style=&#34;color:#719e07&#34;&gt;:&lt;/span&gt; &lt;span style=&#34;color:#2aa198&#34;&gt;&amp;#39;text&amp;#39;&lt;/span&gt;, text&lt;span style=&#34;color:#719e07&#34;&gt;:&lt;/span&gt; &lt;span style=&#34;color:#2aa198&#34;&gt;&amp;#39;What error is shown in this screenshot?&amp;#39;&lt;/span&gt; },&#xA;&lt;/span&gt;&lt;/span&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;        { &lt;span style=&#34;color:#268bd2&#34;&gt;type&lt;/span&gt;&lt;span style=&#34;color:#719e07&#34;&gt;:&lt;/span&gt; &lt;span style=&#34;color:#2aa198&#34;&gt;&amp;#39;image_url&amp;#39;&lt;/span&gt;, image_url&lt;span style=&#34;color:#719e07&#34;&gt;:&lt;/span&gt; { url: &lt;span style=&#34;color:#dc322f&#34;&gt;imageDataUrl&lt;/span&gt; } },&#xA;&lt;/span&gt;&lt;/span&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;      ],&#xA;&lt;/span&gt;&lt;/span&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;    },&#xA;&lt;/span&gt;&lt;/span&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;  ],&#xA;&lt;/span&gt;&lt;/span&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;});&#xA;&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;h2 id=&#34;use-cases&#34;&gt;Use Cases&lt;/h2&gt;&#xA;&lt;p&gt;Receipt parsing, diagram explanation, accessibility alt-text generation, and visual regression triage.&lt;/p&gt;</description>
    </item>
  </channel>
</rss>
