<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
  <channel>
    <title>Ragas on David Lang</title>
    <link>https://www.davidlang.tech/tags/ragas/</link>
    <description>Recent content in Ragas on David Lang</description>
    <generator>Hugo</generator>
    <language>en</language>
    <lastBuildDate>Sat, 18 Oct 2025 00:00:00 +0000</lastBuildDate>
    <atom:link href="https://www.davidlang.tech/tags/ragas/index.xml" rel="self" type="application/rss+xml" />
    <item>
      <title>Evaluating LLM Outputs: RAGAS, DeepEval, and Custom Metrics</title>
      <link>https://www.davidlang.tech/posts/evaluating-llm-outputs-ragas-deepeval-and-custom-metrics/</link>
      <pubDate>Sat, 18 Oct 2025 00:00:00 +0000</pubDate>
      <guid>https://www.davidlang.tech/posts/evaluating-llm-outputs-ragas-deepeval-and-custom-metrics/</guid>
      <description>&lt;p&gt;Frameworks like RAGAS and DeepEval codify LLM evaluation metrics so you can regression-test prompts and pipelines in CI.&lt;/p&gt;&#xA;&lt;h2 id=&#34;ragas-rag-assessment&#34;&gt;RAGAS (RAG Assessment)&lt;/h2&gt;&#xA;&lt;p&gt;Measures context precision/recall, faithfulness, and answer relevance—ideal for retrieval pipelines.&lt;/p&gt;&#xA;&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; style=&#34;color:#93a1a1;background-color:#002b36;-moz-tab-size:4;-o-tab-size:4;tab-size:4;-webkit-text-size-adjust:none;&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;&lt;span style=&#34;color:#719e07&#34;&gt;from&lt;/span&gt; ragas &lt;span style=&#34;color:#719e07&#34;&gt;import&lt;/span&gt; evaluate&#xA;&lt;/span&gt;&lt;/span&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;&lt;span style=&#34;color:#719e07&#34;&gt;from&lt;/span&gt; ragas.metrics &lt;span style=&#34;color:#719e07&#34;&gt;import&lt;/span&gt; faithfulness, answer_relevancy&#xA;&lt;/span&gt;&lt;/span&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;&#xA;&lt;/span&gt;&lt;/span&gt;&lt;span style=&#34;display:flex;&#34;&gt;&lt;span&gt;result &lt;span style=&#34;color:#719e07&#34;&gt;=&lt;/span&gt; evaluate(dataset&lt;span style=&#34;color:#719e07&#34;&gt;=&lt;/span&gt;eval_dataset, metrics&lt;span style=&#34;color:#719e07&#34;&gt;=&lt;/span&gt;[faithfulness, answer_relevancy])&#xA;&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;&lt;h2 id=&#34;deepeval&#34;&gt;DeepEval&lt;/h2&gt;&#xA;&lt;p&gt;Offers pytest-style LLM tests, G-Eval, and hallucination metrics with CI integration.&lt;/p&gt;&#xA;&lt;h2 id=&#34;custom-metrics&#34;&gt;Custom Metrics&lt;/h2&gt;&#xA;&lt;p&gt;Domain-specific checks often outperform generic scores—JSON schema match, SQL execution success, unit test pass rate for codegen.&lt;/p&gt;</description>
    </item>
  </channel>
</rss>
