<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
  <channel>
    <!-- Channel metadata for the "Production" tag feed. -->
    <title>Production on David Lang</title>
    <link>https://www.davidlang.tech/tags/production/</link>
    <description>Recent content in Production on David Lang</description>
    <generator>Hugo</generator>
    <language>en</language>
    <lastBuildDate>Sat, 28 Feb 2026 00:00:00 +0000</lastBuildDate>
    <!-- Self-reference: the URL of this feed document. -->
    <atom:link href="https://www.davidlang.tech/tags/production/index.xml"
               rel="self"
               type="application/rss+xml"/>
    <item>
      <!-- Feed entry for a single post; the description holds the post summary as entity-escaped HTML. -->
      <title>Building Reliable AI Agents: Lessons from Production</title>
      <link>https://www.davidlang.tech/posts/building-reliable-ai-agents-lessons-from-production/</link>
      <pubDate>Sat, 28 Feb 2026 00:00:00 +0000</pubDate>
      <guid>https://www.davidlang.tech/posts/building-reliable-ai-agents-lessons-from-production/</guid>
      <description>&lt;p&gt;Production agents fail in boring ways: timeouts, tool errors, runaway loops, and silent wrong answers. Reliability engineering applies to agents too.&lt;/p&gt;&#xA;&lt;h2 id=&#34;hardening-checklist&#34;&gt;Hardening Checklist&lt;/h2&gt;&#xA;&lt;ul&gt;&#xA;&lt;li&gt;Max steps and token budgets per session&lt;/li&gt;&#xA;&lt;li&gt;Idempotent tools with clear error messages&lt;/li&gt;&#xA;&lt;li&gt;Checkpoint state for long workflows&lt;/li&gt;&#xA;&lt;li&gt;Circuit breakers when external APIs fail&lt;/li&gt;&#xA;&lt;li&gt;Structured logging of every tool call&lt;/li&gt;&#xA;&lt;/ul&gt;&#xA;&lt;h2 id=&#34;graceful-degradation&#34;&gt;Graceful Degradation&lt;/h2&gt;&#xA;&lt;p&gt;When the agent fails, fall back to search-only RAG or human handoff—never an empty error.&lt;/p&gt;</description>
    </item>
  </channel>
</rss>
