<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
  <channel>
    <title>MLOps on David Lang</title>
    <link>https://www.davidlang.tech/tags/mlops/</link>
    <description>Recent content in MLOps on David Lang</description>
    <generator>Hugo</generator>
    <language>en</language>
    <lastBuildDate>Tue, 18 Feb 2025 00:00:00 +0000</lastBuildDate>
    <atom:link href="https://www.davidlang.tech/tags/mlops/index.xml" rel="self" type="application/rss+xml" />
    <item>
      <title>How to Validate and Measure LLM Accuracy in Production</title>
      <link>https://www.davidlang.tech/posts/how-to-validate-and-measure-llm-accuracy-in-production/</link>
      <pubDate>Tue, 18 Feb 2025 00:00:00 +0000</pubDate>
      <guid>https://www.davidlang.tech/posts/how-to-validate-and-measure-llm-accuracy-in-production/</guid>
      <description>&lt;p&gt;Shipping an LLM feature without measurement is shipping a bug generator. Production validation combines automated metrics, human review, and business KPIs.&lt;/p&gt;&#xA;&lt;h2 id=&#34;levels-of-evaluation&#34;&gt;Levels of Evaluation&lt;/h2&gt;&#xA;&lt;ol&gt;&#xA;&lt;li&gt;&lt;strong&gt;Unit-level&lt;/strong&gt; - Schema validation, regex checks, refusal detection&lt;/li&gt;&#xA;&lt;li&gt;&lt;strong&gt;Golden set&lt;/strong&gt; - Curated Q&amp;amp;A pairs scored automatically&lt;/li&gt;&#xA;&lt;li&gt;&lt;strong&gt;Online&lt;/strong&gt; - User thumbs, task completion, support escalations&lt;/li&gt;&#xA;&lt;li&gt;&lt;strong&gt;Human&lt;/strong&gt; - Expert rubrics on sampled traffic&lt;/li&gt;&#xA;&lt;/ol&gt;&#xA;&lt;h2 id=&#34;metrics-that-matter&#34;&gt;Metrics That Matter&lt;/h2&gt;&#xA;&lt;ul&gt;&#xA;&lt;li&gt;&lt;strong&gt;Faithfulness&lt;/strong&gt; - Answer grounded in retrieved context (RAG)&lt;/li&gt;&#xA;&lt;li&gt;&lt;strong&gt;Relevance&lt;/strong&gt; - Addresses the user question&lt;/li&gt;&#xA;&lt;li&gt;&lt;strong&gt;Toxicity / PII&lt;/strong&gt; - Safety filters&lt;/li&gt;&#xA;&lt;li&gt;&lt;strong&gt;Latency and cost&lt;/strong&gt; - p95 tokens and dollars per session&lt;/li&gt;&#xA;&lt;/ul&gt;&#xA;&lt;h2 id=&#34;implementation-sketch&#34;&gt;Implementation Sketch&lt;/h2&gt;&#xA;&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; style=&#34;color:#93a1a1;background-color:#002b36;-moz-tab-size:4;-o-tab-size:4;tab-size:4;-webkit-text-size-adjust:none;&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;&lt;span style=&#34;color:#719e07&#34;&gt;def&lt;/span&gt; &lt;span style=&#34;color:#268bd2&#34;&gt;validate_response&lt;/span&gt;(answer: &lt;span style=&#34;color:#b58900&#34;&gt;str&lt;/span&gt;, context: &lt;span style=&#34;color:#b58900&#34;&gt;str&lt;/span&gt;) &lt;span style=&#34;color:#719e07&#34;&gt;-&amp;gt;&lt;/span&gt; &lt;span style=&#34;color:#b58900&#34;&gt;dict&lt;/span&gt;:&#xA;&lt;/span&gt;&lt;/span&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;    &lt;span style=&#34;color:#719e07&#34;&gt;return&lt;/span&gt; {&#xA;&lt;/span&gt;&lt;/span&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;        &lt;span style=&#34;color:#2aa198&#34;&gt;&amp;#34;has_citation&amp;#34;&lt;/span&gt;: &lt;span style=&#34;color:#2aa198&#34;&gt;&amp;#34;[source:&amp;#34;&lt;/span&gt; &lt;span style=&#34;color:#719e07&#34;&gt;in&lt;/span&gt; answer,&#xA;&lt;/span&gt;&lt;/span&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;        &lt;span style=&#34;color:#2aa198&#34;&gt;&amp;#34;length_ok&amp;#34;&lt;/span&gt;: &lt;span style=&#34;color:#2aa198&#34;&gt;50&lt;/span&gt; &lt;span style=&#34;color:#719e07&#34;&gt;&amp;lt;&lt;/span&gt; &lt;span style=&#34;color:#b58900&#34;&gt;len&lt;/span&gt;(answer) &lt;span style=&#34;color:#719e07&#34;&gt;&amp;lt;&lt;/span&gt; &lt;span style=&#34;color:#2aa198&#34;&gt;4000&lt;/span&gt;,&#xA;&lt;/span&gt;&lt;/span&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;        &lt;span style=&#34;color:#2aa198&#34;&gt;&amp;#34;grounded&amp;#34;&lt;/span&gt;: entailment_score(context, answer) &lt;span style=&#34;color:#719e07&#34;&gt;&amp;gt;&lt;/span&gt; &lt;span style=&#34;color:#2aa198&#34;&gt;0.7&lt;/span&gt;,&#xA;&lt;/span&gt;&lt;/span&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;    }&#xA;&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;p&gt;Log scores to your observability stack (Datadog, LangSmith, Phoenix).&lt;/p&gt;</description>
    </item>
  </channel>
</rss>
