1f:[["$","script",null,{"type":"application/ld+json","dangerouslySetInnerHTML":{"__html":"{\"@context\":\"https://schema.org\",\"url\":\"https://trydock.site/dock/set-up-observability\",\"name\":\"Set up observability: logs, metrics, traces\",\"dateCreated\":\"2026-05-01T04:29:00.576Z\",\"dateModified\":\"2026-05-01T04:29:00.576Z\",\"author\":{\"@type\":\"Person\",\"name\":\"Dock CLI\"},\"publisher\":{\"@type\":\"Organization\",\"name\":\"Dock\",\"url\":\"https://trydock.ai\"},\"@type\":\"Article\",\"headline\":\"Set up observability: logs, metrics, traces\",\"description\":\"Set up observability: logs, metrics, traces 9-step plan for the three pillars of observability (logs, metrics, traces) at a level a 1-5 person team can adopt in a week. For : Small teams running…\",\"articleBody\":\"Set up observability: logs, metrics, traces 9-step plan for the three pillars of observability (logs, metrics, traces) at a level a 1-5 person team can adopt in a week. For : Small teams running…\"}"}}],["$","$L29",null,{"initialWorkspace":{"id":"cmomexy5c003i04l8miz21bor","slug":"set-up-observability","name":"Set up observability: logs, metrics, 
traces","mode":"doc","visibility":"public","columns":[{"key":"title","type":"text","label":"Step","position":0},{"key":"status","type":"status","label":"Status","options":[{"color":"#94A6BD","label":"queued","value":"queued"},{"color":"#06D6A0","label":"active","value":"active"},{"color":"#FF2D92","label":"blocked","value":"blocked"},{"color":"#22C55E","label":"shipped","value":"shipped"}],"position":1},{"key":"owner","type":"text","label":"Owner","position":2},{"key":"estimate","type":"text","label":"Estimate","position":3},{"key":"notes","type":"text","label":"Notes","position":4}],"createdAt":"2026-05-01T04:29:00.576Z","rowCount":32,"memberCount":0,"humanCount":0,"agentCount":0,"docWordCount":3335,"org":{"slug":"dock","name":"Dock"},"role":"viewer","pinnedAt":null,"archivedAt":null},"initialRows":[{"id":"cmomey0ue008204jumd7yof7g","workspaceId":"cmomexy5c003i04l8miz21bor","position":1,"data":{"notes":"","title":"Pick a vendor before you instrument anything","status":"queued","estimate":"2-3 hr of research"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey2uy008t04ju44vx6scg","workspaceId":"cmomexy5c003i04l8miz21bor","position":1,"data":{"url":"https://opentelemetry.io/ecosystem/vendors/","why":"","kind":"official","step":"Pick a vendor before you instrument anything","label":"OpenTelemetry vendor list"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmonjzb56003a04ibsclxzscn","workspaceId":"cmomexy5c003i04l8miz21bor","position":1,"data":{"date":"2026-05-04","notes":"Availability: 99.92% (target 99.9%). Within budget. 
p99: 187ms (target <200ms).","status":"active","version":"Q2 2026"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey11u008504jujwnjwze2","workspaceId":"cmomexy5c003i04l8miz21bor","position":2,"data":{"notes":"","title":"Switch to structured logging (JSON, with a schema)","status":"queued","estimate":"Half a day"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey34g008w04juft4wdg22","workspaceId":"cmomexy5c003i04l8miz21bor","position":2,"data":{"url":"https://www.honeycomb.io/pricing","why":"","kind":"official","step":"Pick a vendor before you instrument anything","label":"Honeycomb pricing"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmonjzbg5003d04ib0awjivs5","workspaceId":"cmomexy5c003i04l8miz21bor","position":2,"data":{"date":"2026-05-15","notes":"Availability dropped to 99.7% — 30min outage from DB pool. 
Investigating.","status":"high","version":"Q2 2026"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey182008804juov5btlm9","workspaceId":"cmomexy5c003i04l8miz21bor","position":3,"data":{"notes":"","title":"Capture the four golden signals as metrics","status":"queued","estimate":"Half a day"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey3az008z04juqub91ly4","workspaceId":"cmomexy5c003i04l8miz21bor","position":3,"data":{"url":"https://grafana.com/pricing/","why":"","kind":"official","step":"Pick a vendor before you instrument anything","label":"Grafana Cloud pricing"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmoot7j5s001604l2aluy6hrz","workspaceId":"cmomexy5c003i04l8miz21bor","position":3,"data":{"date":"2026-06-01","notes":"Availability: 99.95%. p99: 178ms. Error rate: 0.04%. 
All within budget.","status":"active","version":"Q2 2026 mid"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey1eg008b04juv8eegmtv","workspaceId":"cmomexy5c003i04l8miz21bor","position":4,"data":{"notes":"","title":"Wire up distributed tracing with OpenTelemetry","status":"queued","estimate":"1 day"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey3hb009204ju767sgxbu","workspaceId":"cmomexy5c003i04l8miz21bor","position":4,"data":{"url":"https://opentelemetry.io/docs/specs/otel/logs/","why":"","kind":"official","step":"Switch to structured logging (JSON, with a schema)","label":"OpenTelemetry log specification"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmoot7kcp000004l1rntpx0e1","workspaceId":"cmomexy5c003i04l8miz21bor","position":4,"data":{"date":"2026-06-15","notes":"Availability dipped to 99.8% — 60min outage Stripe webhook delays. 
Investigating.","status":"high","version":"Q2 2026 late"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey1m8008e04juxcjdl48x","workspaceId":"cmomexy5c003i04l8miz21bor","position":5,"data":{"notes":"","title":"Define one or two SLOs the team actually believes in","status":"queued","estimate":"Half a day to define, ongoing to refine"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey3nu003s04l8z74shkpr","workspaceId":"cmomexy5c003i04l8miz21bor","position":5,"data":{"url":"https://github.com/pinojs/pino","why":"","kind":"code","step":"Switch to structured logging (JSON, with a schema)","label":"Pino (Node.js logger)"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmoot7klv001904l2i9yly60f","workspaceId":"cmomexy5c003i04l8miz21bor","position":5,"data":{"date":"2026-07-01","notes":"Reset SLO budget. 
Targets: 99.9% availability, p99 <200ms, error rate <0.1%.","status":"active","version":"Q3 2026 baseline"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey1st008h04juanyd9f0o","workspaceId":"cmomexy5c003i04l8miz21bor","position":6,"data":{"notes":"","title":"Set up alerts on burn rate, not on threshold","status":"queued","estimate":"2-3 hr"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey3uv003v04l8w335wkon","workspaceId":"cmomexy5c003i04l8miz21bor","position":6,"data":{"url":"https://12factor.net/logs","why":"","kind":"guide","step":"Switch to structured logging (JSON, with a schema)","label":"Twelve-Factor App: logs"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey20q008k04ju7wwblsql","workspaceId":"cmomexy5c003i04l8miz21bor","position":7,"data":{"notes":"","title":"Build the four runbooks: latency, errors, saturation, third-party down","status":"queued","estimate":"Half a day"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey42l003y04l8sm32ggkf","workspaceId":"cmomexy5c003i04l8miz21bor","position":7,"data":{"url":"https://sre.google/sre-book/monitoring-distributed-systems/","why":"","kind":"guide","step":"Capture the four golden signals as metrics","label":"Google SRE: four golden 
signals"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey26t008n04juxicq9qop","workspaceId":"cmomexy5c003i04l8miz21bor","position":8,"data":{"notes":"","title":"Run a postmortem on the next real incident","status":"queued","estimate":"2-4 hr per incident"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey49h004104l83833n5jz","workspaceId":"cmomexy5c003i04l8miz21bor","position":8,"data":{"url":"https://prometheus.io/docs/practices/histograms/","why":"","kind":"official","step":"Capture the four golden signals as metrics","label":"Prometheus: histogram quantiles"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey2fe008q04jusf63nq5t","workspaceId":"cmomexy5c003i04l8miz21bor","position":9,"data":{"notes":"","title":"Review the dashboard weekly; iterate on signal vs noise","status":"queued","estimate":"30 min/week ongoing"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey4g4004404l8dbmhj3or","workspaceId":"cmomexy5c003i04l8miz21bor","position":9,"data":{"url":"https://opentelemetry.io/docs/getting-started/","why":"","kind":"official","step":"Wire up distributed tracing with OpenTelemetry","label":"OpenTelemetry: getting 
started"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey4m5008t04l82uouhb7e","workspaceId":"cmomexy5c003i04l8miz21bor","position":10,"data":{"url":"https://opentelemetry.io/docs/zero-code/","why":"","kind":"official","step":"Wire up distributed tracing with OpenTelemetry","label":"OpenTelemetry: auto-instrumentation"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey4vf008w04l81c7yyqrh","workspaceId":"cmomexy5c003i04l8miz21bor","position":11,"data":{"url":"https://opentelemetry.io/docs/specs/otlp/","why":"","kind":"official","step":"Wire up distributed tracing with OpenTelemetry","label":"OTLP protocol spec"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey53a008z04l8rp5sz0f2","workspaceId":"cmomexy5c003i04l8miz21bor","position":12,"data":{"url":"https://sre.google/workbook/implementing-slos/","why":"","kind":"guide","step":"Define one or two SLOs the team actually believes in","label":"Google SRE Workbook: SLO chapter"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey59j009204l834mjrrnu","workspaceId":"cmomexy5c003i04l8miz21bor","position":13,"data":{"url":"https://sloth.dev/","why":"","kind":"tool","step":"Define one or two SLOs the team actually believes in","label":"Sloth: SLOs as 
code"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey5gd009504l8fg4evx6e","workspaceId":"cmomexy5c003i04l8miz21bor","position":14,"data":{"url":"https://sre.google/workbook/alerting-on-slos/","why":"","kind":"guide","step":"Set up alerts on burn rate, not on threshold","label":"Google SRE: alerting on SLOs"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey5o1009804l8oj136o7a","workspaceId":"cmomexy5c003i04l8miz21bor","position":15,"data":{"url":"https://www.pagerduty.com/resources/learn/alert-fatigue/","why":"","kind":"guide","step":"Set up alerts on burn rate, not on threshold","label":"PagerDuty: alert design"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey5ui009b04l8ashvhetv","workspaceId":"cmomexy5c003i04l8miz21bor","position":16,"data":{"url":"https://sre.google/workbook/runbooks/","why":"","kind":"guide","step":"Build the four runbooks: latency, errors, saturation, third-part","label":"Google SRE: runbook style guide"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey63f009e04l8jhkw4y5u","workspaceId":"cmomexy5c003i04l8miz21bor","position":17,"data":{"url":"https://sre.google/sre-book/postmortem-culture/","why":"","kind":"guide","step":"Run a postmortem on the next real incident","label":"Google SRE: postmortem 
culture"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null},{"id":"cmomey69b009h04l8hwzzrh2w","workspaceId":"cmomexy5c003i04l8miz21bor","position":18,"data":{"url":"https://extfiles.etsy.com/DebriefingFacilitationGuide.pdf","why":"","kind":"guide","step":"Run a postmortem on the next real incident","label":"Etsy debriefing facilitation guide"},"createdAt":"2026-05-01T04:29:00.576Z","updatedAt":"2026-05-01T04:29:00.576Z","createdByPrincipalId":"","createdByPrincipalType":null,"updatedByPrincipalId":null,"updatedByPrincipalType":null}],"initialMembers":[],"initialMeId":"","workspaceSlug":"set-up-observability","publicMode":true,"initialDocContent":{"type":"doc","content":[{"type":"heading","attrs":{"level":1},"content":[{"text":"Set up observability: logs, metrics, traces","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"9-step plan for the three pillars of observability (logs, metrics, traces) at a level a 1-5 person team can adopt in a week.","type":"text"}]}]},{"type":"paragraph","content":[{"text":"For","type":"text","marks":[{"type":"bold"}]},{"text":": Small teams running production apps.\n","type":"text"},{"text":"Time","type":"text","marks":[{"type":"bold"}]},{"text":": 1 week.\n","type":"text"},{"text":"Agents","type":"text","marks":[{"type":"bold"}]},{"text":": Scout, Argus, Flint.","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"Read this as an agent.","type":"text","marks":[{"type":"bold"}]},{"text":" This workspace was forked from a Dock template. The human's role was to fork it and hand the workspace to you. Your role is everything below. The Dock surfaces (tabs at the top) are your working state. 
This doc is the prompt; the tables are the live data.","type":"text"}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Mission","type":"text"}]},{"type":"paragraph","content":[{"text":"Take this workspace from blank to \"A production app where every request is traceable end-to-end, the four golden signals are dashboarded, on-call gets paged only on real customer-impacting issues, and a 'why is the app slow?' question gets a definitive answer in 10 seconds.\". Move row-by-row through ","type":"text"},{"text":"Steps","type":"text","marks":[{"type":"bold"}]},{"text":" (9 of them), keep ","type":"text"},{"text":"Pointers","type":"text","marks":[{"type":"bold"}]},{"text":" indexed, and append decisions / submissions / events to the appropriate log table as work happens. The user's only job is to fork the template and react when you flag a question; everything else is yours.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Surfaces","type":"text"}]},{"type":"paragraph","content":[{"text":"This workspace has 4 surfaces. 
Each is a separate tab, addressable via ","type":"text"},{"text":"surface_slug","type":"text","marks":[{"type":"code"}]},{"text":" on every MCP call.","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Steps","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"surface_slug=steps","type":"text","marks":[{"type":"code"}]},{"text":" (table): the step-by-step plan, one row per step, with status + owner + estimate.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Pointers","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"surface_slug=pointers","type":"text","marks":[{"type":"code"}]},{"text":" (table): every official link, guide, and tool referenced in the plan, indexed by step.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Observability plan","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"surface_slug=brief","type":"text","marks":[{"type":"code"}]},{"text":" (doc): the long-form plan + how the workspace is meant to be used. You're reading it now.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"SLO log","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"surface_slug=slo-log","type":"text","marks":[{"type":"code"}]},{"text":" (table): structured data (rows + columns) for this workspace.","type":"text"}]}]}]},{"type":"mermaid","attrs":{"source":"flowchart LR\n classDef doc fill:#0A84FF15,stroke:#0A84FF,color:#0A84FF\n classDef tbl fill:#FF2D9215,stroke:#FF2D92,color:#FF2D92\n brief[\"Observability plan
(doc)\"]:::doc\n steps[\"Steps
(table)\"]:::tbl\n pointers[\"Pointers
(table)\"]:::tbl\n slo-log[\"SLO log
(table)\"]:::tbl\n steps --> brief\n pointers --> brief\n slo-log --> brief"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"First MCP calls","type":"text"}]},{"type":"paragraph","content":[{"text":"Bootstrap context before acting:","type":"text"}]},{"type":"codeBlock","attrs":{},"content":[{"text":"list_surfaces(workspace_slug=\"set-up-observability\")\nlist_rows(workspace_slug=\"set-up-observability\", surface_slug=\"steps\")\nlist_rows(workspace_slug=\"set-up-observability\", surface_slug=\"pointers\")\nlist_rows(workspace_slug=\"set-up-observability\", surface_slug=\"slo-log\")\nget_doc(workspace_slug=\"set-up-observability\", surface_slug=\"brief\")","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Cadence","type":"text"}]},{"type":"paragraph","content":[{"text":"On every new Steps row marked active","type":"text","marks":[{"type":"bold"}]},{"text":": pull the step body, surface the tasks + gotchas as a comment thread on the row.","type":"text"}]},{"type":"paragraph","content":[{"text":"On every Steps row transitioning to shipped","type":"text","marks":[{"type":"bold"}]},{"text":": append a 1-line entry to ","type":"text"},{"text":"Brief","type":"text","marks":[{"type":"bold"}]},{"text":" summarizing what shipped + the artifact link.","type":"text"}]},{"type":"paragraph","content":[{"text":"On every new Pointers row","type":"text","marks":[{"type":"bold"}]},{"text":": cross-link it to the related step (set ","type":"text"},{"text":"step","type":"text","marks":[{"type":"code"}]},{"text":" field).","type":"text"}]},{"type":"paragraph","content":[{"text":"Daily","type":"text","marks":[{"type":"bold"}]},{"text":": scan all surfaces for stale rows (nothing changed in 3+ days) and surface them in a comment.","type":"text"}]},{"type":"paragraph","content":[{"text":"On user @mention","type":"text","marks":[{"type":"bold"}]},{"text":": pause your cadence work and respond directly to whatever they 
ask.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Output contract","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"A working ","type":"text"},{"text":"Steps","type":"text","marks":[{"type":"bold"}]},{"text":" table with every row's status accurate.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"A populated ","type":"text"},{"text":"Pointers","type":"text","marks":[{"type":"bold"}]},{"text":" table with every reference URL the user touched.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"A running ","type":"text"},{"text":"Observability plan","type":"text","marks":[{"type":"bold"}]},{"text":" doc with one entry per shipped step (chronological).","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"An accurate ","type":"text"},{"text":"SLO log","type":"text","marks":[{"type":"bold"}]},{"text":" with every recorded event.","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Constraints","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Never mark a Steps row shipped without verifying the artifact exists (URL works, post is live, etc).","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Never overwrite the user's prose in doc surfaces; always append below or comment with proposed edits.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Never delete rows the user authored; archive via status change instead.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Never auto-publish, auto-send email, or auto-execute irreversible actions. 
Draft only; the user confirms.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Never act on cross-org workspaces. Stay within this workspace's surfaces.","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Full prompt (verbatim, for paste-into-runtime)","type":"text"}]},{"type":"codeBlock","attrs":{},"content":[{"text":"You are an agent on the \"Set up observability\" template workspace.\n\nYour role: maintain the four surfaces (Steps, Pointers, Brief, SLO log) as the team rolls out observability.\n\nCadence:\n- When a step is marked Done, append to the Brief doc what shipped (schema, dashboard URL, runbook link).\n- When the user defines an SLO, capture it as a row in SLO log with target / window / error budget / alert link.\n- When an incident happens, link the postmortem to the relevant SLO row in SLO log.\n\nFirst MCP tool calls:\n1. list_surfaces(workspace_slug=\"set-up-observability\")\n2. list_rows(workspace_slug=\"set-up-observability\", surface_slug=\"steps\")\n3. get_doc(workspace_slug=\"set-up-observability\", surface_slug=\"brief\")\n\nWhen proposing instrumentation, always read the user's codebase first before suggesting span names - they should match real code paths.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"The journey","type":"text"}]},{"type":"mermaid","attrs":{"source":"flowchart TD\n s1[\"1. Pick a vendor before you instrument\"]\n s2[\"2. Switch to structured logging (JSON, with\"]\n s1 --> s2\n s3[\"3. Capture the four golden signals as\"]\n s2 --> s3\n s4[\"4. Wire up distributed tracing with OpenTelemetry\"]\n s3 --> s4\n s5[\"5. Define one or two SLOs the\"]\n s4 --> s5\n s6[\"6. Set up alerts on burn rate,\"]\n s5 --> s6\n s7[\"7. Build the four runbooks: latency, errors,\"]\n s6 --> s7\n s8[\"8. Run a postmortem on the next\"]\n s7 --> s8\n s9[\"9. 
Review the dashboard weekly; iterate on\"]\n s8 --> s9"}},{"type":"paragraph","content":[{"text":"Each step is a row in the ","type":"text"},{"text":"Steps","type":"text","marks":[{"type":"bold"}]},{"text":" table, with tasks, pointers, gotchas, and a status column. Mark rows shipped as you go. The full per-step body is below.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Architecture","type":"text"}]},{"type":"mermaid","attrs":{"source":"flowchart LR\n Client --> CDN\n CDN --> Edge[Edge runtime]\n Edge --> API[API server]\n API --> DB[(Primary DB)]\n API --> Replica[(Read replica)]\n API --> Cache[(Redis)]"}},{"type":"heading","attrs":{"level":2},"content":[{"text":"SLO target","type":"text"}]},{"type":"math","attrs":{"source":"\\text{availability} \\geq 99.9\\%, \\quad p_{99} < 200\\text{ ms}, \\quad \\text{error rate} < 0.1\\%","display":"block"}},{"type":"callout","attrs":{"variant":"caution"},"content":[{"type":"paragraph","content":[{"text":"Irreversible operations (DROP TABLE, schema migrations without rollback, region failover) require a confirmation token from the user. 
Never auto-execute.","type":"text"}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Companion templates","type":"text"}]},{"type":"paragraph","content":[{"text":"Templates that compose well with this one — fork them alongside or after:","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"type":"crossRef","attrs":{"form":"bare","rowId":null,"display":null,"orgSlug":null,"surfaceSlug":null,"workspaceSlug":"set-up-incident-response-and-postmortems"}}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"type":"crossRef","attrs":{"form":"bare","rowId":null,"display":null,"orgSlug":null,"surfaceSlug":null,"workspaceSlug":"set-up-status-page-for-saas"}}]}]}]},{"type":"horizontalRule"},{"type":"paragraph","content":[{"text":"A 9-step plan covering logs, metrics, and traces for a small team. Open in Dock and you'll get four surfaces seeded:","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Steps","type":"text","marks":[{"type":"bold"}]},{"text":" (table) - the 9 steps as rows, owner + status","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Pointers","type":"text","marks":[{"type":"bold"}]},{"text":" (table) - linked OpenTelemetry, vendor, and SRE-book references","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Brief","type":"text","marks":[{"type":"bold"}]},{"text":" (doc) - your team's observability reference (schemas, dashboards, runbooks)","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"SLO log","type":"text","marks":[{"type":"bold"}]},{"text":" (table) - one row per SLO, with target, window, error budget, and link to alert","type":"text"}]}]}]},{"type":"paragraph","content":[{"text":"Read ","type":"text"},{"text":"Steps","type":"text","marks":[{"type":"code"}]},{"text":" top-to-bottom. 
The pillars build on each other: structured logs first, metrics second, traces third.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Outcome","type":"text"}]},{"type":"paragraph","content":[{"text":"A production app where every request is traceable end-to-end, the four golden signals are dashboarded, on-call gets paged only on real customer-impacting issues, and a 'why is the app slow?' question gets a definitive answer in 10 seconds.","type":"text"}]},{"type":"paragraph","content":[{"text":"Estimated time:","type":"text","marks":[{"type":"bold"}]},{"text":" 1 week","type":"text"},{"type":"hardBreak"},{"text":"Difficulty:","type":"text","marks":[{"type":"bold"}]},{"text":" intermediate","type":"text"},{"type":"hardBreak"},{"text":"For:","type":"text","marks":[{"type":"bold"}]},{"text":" Small engineering teams with a production app and no SRE.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"What you'll need","type":"text"}]},{"type":"paragraph","content":[{"text":"Pre-register or install before you start.","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"OpenTelemetry","type":"text","marks":[{"type":"bold"},{"type":"link","attrs":{"href":"https://opentelemetry.io/"}}]},{"text":" ","type":"text"},{"text":"(Free open source)","type":"text","marks":[{"type":"italic"}]},{"text":" — Vendor-neutral instrumentation: SDKs in every major language, exporters to most backends.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Sentry","type":"text","marks":[{"type":"bold"},{"type":"link","attrs":{"href":"https://sentry.io/"}}]},{"text":" ","type":"text"},{"text":"(Free Developer (5k errors/mo), $26/mo Team)","type":"text","marks":[{"type":"italic"}]},{"text":" — Error tracking with source-mapped stacks and release 
tracking.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Grafana Cloud","type":"text","marks":[{"type":"bold"},{"type":"link","attrs":{"href":"https://grafana.com/products/cloud/"}}]},{"text":" ","type":"text"},{"text":"(Free tier (10k metrics, 50GB logs/mo), $19/mo Pro)","type":"text","marks":[{"type":"italic"}]},{"text":" — Hosted Prometheus + Loki + Tempo for metrics, logs, traces in one stack.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Honeycomb","type":"text","marks":[{"type":"bold"},{"type":"link","attrs":{"href":"https://www.honeycomb.io/"}}]},{"text":" ","type":"text"},{"text":"(Free up to 20M events/mo, then usage-based)","type":"text","marks":[{"type":"italic"}]},{"text":" — Alternative trace-first backend strong on high-cardinality querying.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Datadog","type":"text","marks":[{"type":"bold"},{"type":"link","attrs":{"href":"https://www.datadoghq.com/"}}]},{"text":" ","type":"text"},{"text":"($15-31/host/mo plus per-feature add-ons)","type":"text","marks":[{"type":"italic"}]},{"text":" — Full-stack alternative when you want one vendor for everything and budget allows.","type":"text"}]}]}]},{"type":"horizontalRule"},{"type":"heading","attrs":{"level":1},"content":[{"text":"The template · 9 steps","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Step 1: Pick a vendor before you instrument anything","type":"text"}]},{"type":"paragraph","content":[{"text":"Estimated time: 2-3 hr of research","type":"text","marks":[{"type":"italic"}]}]},{"type":"paragraph","content":[{"text":"Picking the backend first prevents the worst pattern: instrument everything, then realize the vendor's pricing model penalizes the cardinality you committed to. Logs go in Loki / Datadog Logs / Sentry. Metrics go in Prometheus / Datadog. Traces go in Tempo / Honeycomb / Datadog APM. 
Pick a stack that covers all three; resist the urge to mix-and-match across vendors on day one.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Tasks","type":"text"}]},{"type":"taskList","content":[{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"List your top 3 budget constraints (monthly spend, team size, on-call sophistication)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Compare three options: Grafana Cloud (cheapest), Honeycomb (best for traces), Datadog (best UX, priciest)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Estimate volume: log events per day, metrics cardinality, trace spans per second","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Sign up for the free tier of your pick and start there","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Pointers","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"[Official]","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"OpenTelemetry vendor list","type":"text","marks":[{"type":"link","attrs":{"href":"https://opentelemetry.io/ecosystem/vendors/"}}]}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"[Official]","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"Honeycomb pricing","type":"text","marks":[{"type":"link","attrs":{"href":"https://www.honeycomb.io/pricing"}}]}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"[Official]","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"Grafana Cloud 
pricing","type":"text","marks":[{"type":"link","attrs":{"href":"https://grafana.com/pricing/"}}]}]}]}]},{"type":"callout","attrs":{"variant":"caution"},"content":[{"type":"paragraph","content":[{"text":"Gotchas","type":"text","marks":[{"type":"bold"}]}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Datadog's bill explodes with custom metrics cardinality. A single tag with 10,000 unique values can cost more than your hosting.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Sentry is great for errors but a poor fit for general logging. Don't pipe all your logs into Sentry; you'll exhaust the free tier in a week.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Vendor switching is painful once you have dashboards, alerts, and runbooks. Pick deliberately, not by default.","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Step 2: Switch to structured logging (JSON, with a schema)","type":"text"}]},{"type":"paragraph","content":[{"text":"Estimated time: Half a day","type":"text","marks":[{"type":"italic"}]}]},{"type":"paragraph","content":[{"text":"Plain text logs are uncorrelatable. Structured JSON logs with a known schema are queryable, alertable, and aggregatable. 
Adopt a logging library that emits JSON by default, define 5-10 standard fields (timestamp, level, request_id, user_id, route, latency_ms), and prohibit free-form messages from carrying load-bearing data.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Tasks","type":"text"}]},{"type":"taskList","content":[{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Pick a structured-logging lib for your language (pino in Node, zap in Go, structlog in Python)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Define a JSON schema with 5-10 standard fields every log line carries","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Add request_id as a header that propagates through every middleware and downstream call","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Replace string-template log lines with structured fields ('user logged in' becomes ","type":"text"},{"text":"{ event: 'login', user_id, method }","type":"text","marks":[{"type":"code"}]},{"text":")","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Verify a sample of logs in the backend and confirm they're queryable by user_id","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Pointers","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"[Official]","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"OpenTelemetry log specification","type":"text","marks":[{"type":"link","attrs":{"href":"https://opentelemetry.io/docs/specs/otel/logs/"}}]}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"[Code]","type":"text","marks":[{"type":"bold"}]},{"text":" 
","type":"text"},{"text":"Pino (Node.js logger)","type":"text","marks":[{"type":"link","attrs":{"href":"https://github.com/pinojs/pino"}}]}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"[Guide]","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"Twelve-Factor App: logs","type":"text","marks":[{"type":"link","attrs":{"href":"https://12factor.net/logs"}}]}]}]}]},{"type":"callout","attrs":{"variant":"caution"},"content":[{"type":"paragraph","content":[{"text":"Gotchas","type":"text","marks":[{"type":"bold"}]}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Don't log full request bodies. Production logs leak PII into your vendor and your retention policy. Log the request shape, not the payload.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Free-form string logs are an anti-pattern at scale. The day you need to query 'all 500s on /checkout in the last hour by user', regex over strings is a 3-hour grep, structured fields are a 30-second query.","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Agent prompt for this step","type":"text"}]},{"type":"codeBlock","attrs":{"language":"text"},"content":[{"text":"Read the codebase. Find every log call (console.log, logger.info, log.Info, etc.).\n\nFor each, propose a refactor that:\n1. Uses the team's structured-logging library (ask the user which one).\n2. Carries the standard fields (timestamp, level, request_id, user_id, route, event_name).\n3. Replaces string templates with structured fields (\"user logged in\" becomes event: \"login\" + user_id).\n\nOutput as a list of file:line references with proposed replacements. 
Flag any log that contains PII (email, full name, IP) for review under your data classification.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Step 3: Capture the four golden signals as metrics","type":"text"}]},{"type":"paragraph","content":[{"text":"Estimated time: Half a day","type":"text","marks":[{"type":"italic"}]}]},{"type":"paragraph","content":[{"text":"Latency, traffic, errors, saturation. Every service needs them. Most observability tools give them to you for free if you instrument correctly: latency from request duration histograms, traffic from request counters, errors from status code counters, saturation from CPU / memory / queue depth. Build a dashboard with these four panels per service and call it the home base.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Tasks","type":"text"}]},{"type":"taskList","content":[{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Add a request-duration histogram (p50, p95, p99) keyed by route + status_code","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Add a request-count counter keyed by route + status_code","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Add an error-count counter for 5xx responses","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Add saturation metrics: CPU, memory, max DB connection pool, max queue depth","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Build a 4-panel dashboard per service: Latency, Traffic, Errors, Saturation","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Link the dashboard URL in your team 
docs","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Pointers","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"[Guide]","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"Google SRE: four golden signals","type":"text","marks":[{"type":"link","attrs":{"href":"https://sre.google/sre-book/monitoring-distributed-systems/"}}]}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"[Official]","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"Prometheus: histogram quantiles","type":"text","marks":[{"type":"link","attrs":{"href":"https://prometheus.io/docs/practices/histograms/"}}]}]}]}]},{"type":"callout","attrs":{"variant":"caution"},"content":[{"type":"paragraph","content":[{"text":"Gotchas","type":"text","marks":[{"type":"bold"}]}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Latency p99 over a 1-minute window is noisy with low traffic. Use 5-min windows for p99 if your service does under 100 req/s.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Don't use averages for latency. Average latency hides the long tail. Always p50 / p95 / p99.","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Step 4: Wire up distributed tracing with OpenTelemetry","type":"text"}]},{"type":"paragraph","content":[{"text":"Estimated time: 1 day","type":"text","marks":[{"type":"italic"}]}]},{"type":"paragraph","content":[{"text":"Tracing is the single biggest investigative tool you can add. A trace shows the path of one request through every service it touches, with timing per hop. 
OpenTelemetry's auto-instrumentation libraries cover most popular frameworks; you usually only need to add 5-10 manual spans for the important business logic.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Tasks","type":"text"}]},{"type":"taskList","content":[{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Install the OpenTelemetry SDK + exporter for your language","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Enable auto-instrumentation for HTTP server, HTTP client, DB driver","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Add manual spans around the 5 most important business operations (checkout, signup, cron job)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Configure the exporter to send to your chosen backend (OTLP endpoint)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Verify a request shows up as a complete trace in the backend, with all hops","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Pointers","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"[Official]","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"OpenTelemetry: getting started","type":"text","marks":[{"type":"link","attrs":{"href":"https://opentelemetry.io/docs/getting-started/"}}]}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"[Official]","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"OpenTelemetry: 
auto-instrumentation","type":"text","marks":[{"type":"link","attrs":{"href":"https://opentelemetry.io/docs/zero-code/"}}]}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"[Official]","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"OTLP protocol spec","type":"text","marks":[{"type":"link","attrs":{"href":"https://opentelemetry.io/docs/specs/otlp/"}}]}]}]}]},{"type":"callout","attrs":{"variant":"caution"},"content":[{"type":"paragraph","content":[{"text":"Gotchas","type":"text","marks":[{"type":"bold"}]}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"OpenTelemetry SDKs sample 100% of traces by default (parentbased_always_on), but vendor distributions and copied configs often lower the rate. For a small app, keep sampling at 100% of traces. Drop to head-based sampling at 1% only when volume justifies it.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Manual spans without ","type":"text"},{"text":"try/finally","type":"text","marks":[{"type":"code"}]},{"text":" (or context manager) leak when an exception fires. Use the SDK's recommended pattern for your language.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Trace context propagation breaks across queue boundaries (SQS, Kafka, Redis pub/sub) unless you serialize the trace headers in the message envelope.","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Agent prompt for this step","type":"text"}]},{"type":"codeBlock","attrs":{"language":"text"},"content":[{"text":"Read the codebase and propose 10 manual span instrumentations.\n\nFor each:\n1. The function or block to wrap.\n2. The span name (use convention: \"service.operation\", e.g. \"checkout.charge_card\").\n3. The attributes to set on the span (user_id, order_id, amount, etc.).\n\nPrioritize: payment flows, signup, third-party API calls, slow DB queries, background jobs. 
Skip pure-CPU functions.\n\nOutput as a list with file:line references and a code snippet showing the instrumentation in the team's language.","type":"text"}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Step 5: Define one or two SLOs the team actually believes in","type":"text"}]},{"type":"paragraph","content":[{"text":"Estimated time: Half a day to define, ongoing to refine","type":"text","marks":[{"type":"italic"}]}]},{"type":"paragraph","content":[{"text":"An SLO is a target for how often the service does the right thing. The classic shape: '99.9% of /checkout requests complete in under 500ms over a rolling 30 days.' That gives you an error budget (0.1% over 30 days = ~43 minutes). When you burn through the budget, you slow feature velocity. When you have lots of budget, you ship aggressively. SLOs only work if the team treats them as real.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Tasks","type":"text"}]},{"type":"taskList","content":[{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Pick the 1-2 user-facing flows that matter most (signup, checkout, primary product action)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"For each, write the SLO: 'X% of [request type] complete in under [latency] / without error, over [window]'","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Calculate the error budget (1 - target = budget; over the window = absolute minutes/requests)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Set up the metric query that measures it; verify the historical baseline meets the target","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Decide the policy: what do we DO when we burn budget? 
(slow features, freeze deploys, etc.)","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Pointers","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"[Guide]","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"Google SRE Workbook: SLO chapter","type":"text","marks":[{"type":"link","attrs":{"href":"https://sre.google/workbook/implementing-slos/"}}]}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"[Tool]","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"Sloth: SLOs as code","type":"text","marks":[{"type":"link","attrs":{"href":"https://sloth.dev/"}}]}]}]}]},{"type":"callout","attrs":{"variant":"caution"},"content":[{"type":"paragraph","content":[{"text":"Gotchas","type":"text","marks":[{"type":"bold"}]}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Don't pick 99.99% just because it sounds good. A 99.99% SLO over 30 days is 4 minutes of error budget. Most small teams can't operate at that level without dedicated SRE.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"An SLO without a written 'what we do when we burn budget' policy is decoration. Write the policy.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"SLOs measured against availability without latency miss the silent slowdown. Always pair availability with latency.","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Step 6: Set up alerts on burn rate, not on threshold","type":"text"}]},{"type":"paragraph","content":[{"text":"Estimated time: 2-3 hr","type":"text","marks":[{"type":"italic"}]}]},{"type":"paragraph","content":[{"text":"Threshold alerts ('latency over 500ms') page constantly on noise. 
Burn-rate alerts ('we'll exhaust this month's error budget in 6 hours at the current rate') page when something is actually breaking. Multi-window burn rate (fast: 1 hour, slow: 6 hours) catches both fast outages and slow degradations.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Tasks","type":"text"}]},{"type":"taskList","content":[{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"For each SLO, configure a fast-burn alert (5% of monthly budget in 1 hour) - pages on-call immediately","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Configure a slow-burn alert (10% of monthly budget in 6 hours) - opens a ticket, no page","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Route alerts to a single on-call rotation (PagerDuty / Opsgenie / on-call-bot in Slack)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Run a synthetic alert drill: trigger a fake burn, verify the page reaches the on-call","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Document the response runbook for each alert","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Pointers","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"[Guide]","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"Google SRE: alerting on SLOs","type":"text","marks":[{"type":"link","attrs":{"href":"https://sre.google/workbook/alerting-on-slos/"}}]}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"[Guide]","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"PagerDuty: alert 
design","type":"text","marks":[{"type":"link","attrs":{"href":"https://www.pagerduty.com/resources/learn/alert-fatigue/"}}]}]}]}]},{"type":"callout","attrs":{"variant":"caution"},"content":[{"type":"paragraph","content":[{"text":"Gotchas","type":"text","marks":[{"type":"bold"}]}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Alerts on individual host CPU, individual instance latency, etc. are an anti-pattern. Alert on user-facing SLOs, investigate via dashboards.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"If your on-call is woken up more than once a quarter for something that didn't actually impact users, your alerts are wrong. Tune them down.","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Step 7: Build the four runbooks: latency, errors, saturation, third-party down","type":"text"}]},{"type":"paragraph","content":[{"text":"Estimated time: Half a day","type":"text","marks":[{"type":"italic"}]}]},{"type":"paragraph","content":[{"text":"Runbooks turn a 2 AM page into a 15-minute fix instead of a 2-hour panic. Write four for the most common failure modes: high latency, error rate spike, saturation (out-of-memory / connection pool exhausted), third-party dependency down. 
Each is a numbered list of 'first 5 minutes' steps.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Tasks","type":"text"}]},{"type":"taskList","content":[{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Latency runbook: which dashboard to open, which trace to look at, common causes (cold start, slow DB query, third-party slowdown)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Errors runbook: which logs to query, how to find the deploying commit, the rollback procedure","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Saturation runbook: which metric to check (memory, DB connections, queue depth), how to scale, how to drain a stuck queue","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Third-party-down runbook: which third party (auth provider, payment processor, email provider), the status page URL, the in-app degradation behavior","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Link each runbook from the matching alert","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Pointers","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"[Guide]","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"Google SRE: runbook style guide","type":"text","marks":[{"type":"link","attrs":{"href":"https://sre.google/workbook/runbooks/"}}]}]}]}]},{"type":"callout","attrs":{"variant":"caution"},"content":[{"type":"paragraph","content":[{"text":"Gotchas","type":"text","marks":[{"type":"bold"}]}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Runbooks rot when no one runs them. 
Run a tabletop drill quarterly: pick a runbook, execute every step, find the broken link or stale screenshot, fix it.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Runbooks that just say 'investigate' aren't runbooks. They need specific dashboards, specific commands, specific decision trees.","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Step 8: Run a postmortem on the next real incident","type":"text"}]},{"type":"paragraph","content":[{"text":"Estimated time: 2-4 hr per incident","type":"text","marks":[{"type":"italic"}]}]},{"type":"paragraph","content":[{"text":"Postmortems compound. The first one feels like overhead; the tenth one is your team's most valuable institutional document. Write the postmortem within 48 hours of the incident, blameless tone, focused on the system gaps not the individual mistakes.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Tasks","type":"text"}]},{"type":"taskList","content":[{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Within 48 hr: write the postmortem in a shared doc, blameless tone","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Sections: timeline, impact, root cause, contributing factors, what worked, what didn't, action items","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"For each action item: name a single owner and a due date","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Review the postmortem in a 30-min team meeting; capture follow-up questions","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Track action items in your team's tracker until 
done","type":"text"}]}]}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Pointers","type":"text"}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"[Guide]","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"Google SRE: postmortem culture","type":"text","marks":[{"type":"link","attrs":{"href":"https://sre.google/sre-book/postmortem-culture/"}}]}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"[Guide]","type":"text","marks":[{"type":"bold"}]},{"text":" ","type":"text"},{"text":"Etsy debriefing facilitation guide","type":"text","marks":[{"type":"link","attrs":{"href":"https://extfiles.etsy.com/DebriefingFacilitationGuide.pdf"}}]}]}]}]},{"type":"callout","attrs":{"variant":"caution"},"content":[{"type":"paragraph","content":[{"text":"Gotchas","type":"text","marks":[{"type":"bold"}]}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Blameful postmortems destroy team trust. The same human error in a system that allowed it through is a system bug, not a person bug.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Action items without owners or dates are decorative. Track them like P0 work; close them or formally drop them.","type":"text"}]}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Step 9: Review the dashboard weekly; iterate on signal vs noise","type":"text"}]},{"type":"paragraph","content":[{"text":"Estimated time: 30 min/week ongoing","type":"text","marks":[{"type":"italic"}]}]},{"type":"paragraph","content":[{"text":"Observability is not done; it drifts. Set a weekly 30-min slot to walk through the dashboards, check SLO health, prune alerts that paged-but-weren't-real, add metrics for the new things you shipped. 
The team that does this is 10x better at responding to incidents in 12 months than the team that doesn't.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Tasks","type":"text"}]},{"type":"taskList","content":[{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Pick a weekly recurring slot (Monday morning works for most teams)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Review SLO compliance over the last 7 days; flag any near burn","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Review every page from the last 7 days; ask 'was this real?'","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"For each false page, tune the alert (raise threshold, change window, mute by tag)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"For each new feature shipped, add the metric or trace span that lets you debug it","type":"text"}]}]}]},{"type":"callout","attrs":{"variant":"caution"},"content":[{"type":"paragraph","content":[{"text":"Gotchas","type":"text","marks":[{"type":"bold"}]}]},{"type":"bulletList","content":[{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"Most teams skip the weekly review for 'busier' work. Six months later they have 200 alerts firing weekly and on-call is hated. Don't skip it.","type":"text"}]}]},{"type":"listItem","content":[{"type":"paragraph","content":[{"text":"If you can't reproduce the metric query for an alert by hand, the alert is not maintainable. 
Document the query in the runbook.","type":"text"}]}]}]}]},{"type":"horizontalRule"},{"type":"heading","attrs":{"level":2},"content":[{"text":"Hand the template to your agent","type":"text"}]},{"type":"paragraph","content":[{"text":"Paste the prompt below into your agent's permanent system prompt so the agent reads, writes, and maintains this workspace as you work through the steps.","type":"text"}]},{"type":"codeBlock","attrs":{"language":"text"},"content":[{"text":"You are an agent on the \"Set up observability\" template workspace.\n\nYour role: maintain the four surfaces (Steps, Pointers, Brief, SLO log) as the team rolls out observability.\n\nCadence:\n- When a step is marked shipped, append to the Brief doc what shipped (schema, dashboard URL, runbook link).\n- When the user defines an SLO, capture it as a row in SLO log with target / window / error budget / alert link.\n- When an incident happens, link the postmortem to the relevant SLO row in SLO log.\n\nFirst MCP tool calls:\n1. list_surfaces(workspace_slug=\"set-up-observability\")\n2. list_rows(workspace_slug=\"set-up-observability\", surface_slug=\"steps\")\n3. get_doc(workspace_slug=\"set-up-observability\", surface_slug=\"brief\")\n\nWhen proposing instrumentation, always read the user's codebase first before suggesting span names - they should match real code paths.","type":"text"}]},{"type":"horizontalRule"},{"type":"heading","attrs":{"level":2},"content":[{"text":"FAQ","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Do I need all three pillars (logs, metrics, traces) on day one?","type":"text"}]},{"type":"paragraph","content":[{"text":"Structured logs and the four golden signals as metrics, yes. Traces can wait 1-2 sprints if you have a small monolith - logs cover most of what traces would tell you. 
Once you have multiple services or async work (queues, cron, background jobs), traces become essential because logs alone can't reconstruct the request path.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"What does observability cost for a small team?","type":"text"}]},{"type":"paragraph","content":[{"text":"Grafana Cloud's free tier covers most teams under ~10 services: 10k metrics, 50GB logs, 50GB traces per month. Sentry's free Developer tier handles 5k errors. OpenTelemetry itself is free open source. The first paid step is usually $19-50/mo when you outgrow the free tier - still cheaper than a single hour of debugging a production outage blind.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Why OpenTelemetry instead of a vendor SDK?","type":"text"}]},{"type":"paragraph","content":[{"text":"Vendor lock-in. OpenTelemetry is the vendor-neutral, CNCF-hosted instrumentation standard (it propagates context using the W3C Trace Context format); switching backends becomes a config change, not a rewrite. Most major vendors (Honeycomb, Datadog, Grafana, AWS X-Ray) accept OTLP natively. Vendor SDKs are sometimes more polished but tie you to one provider's roadmap and pricing.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"How many alerts should we have?","type":"text"}]},{"type":"paragraph","content":[{"text":"Far fewer than most teams have. A small team should have 5-10 alerts max: one or two SLO burn-rate alerts per critical user flow, plus saturation alerts for the few resources that can hard-cap the service (memory, DB connections). If you have 50 alerts firing weekly, you have 50 alerts your team has learned to ignore.","type":"text"}]},{"type":"heading","attrs":{"level":3},"content":[{"text":"Can my AI agents help maintain the observability stack?","type":"text"}]},{"type":"paragraph","content":[{"text":"Yes. 
Agents are useful for: drafting the structured-logging schema from existing log calls, proposing trace spans by reading the codebase, refreshing dashboards when new metrics are added, summarising weekly SLO health, and triaging which incidents deserve a full postmortem vs a one-line note. The template ships agent prompts inline for the logging refactor and trace instrumentation steps.","type":"text"}]}]},"initialSurfaces":[{"id":"cmomexy5h003l04l8ntw9lof8","kind":"doc","name":"Observability plan","slug":"brief"},{"id":"cmomexyp8003o04l80e8uvi26","kind":"table","name":"Steps","slug":"steps"},{"id":"cmomexz1t003q04l8okax4fbk","kind":"table","name":"Pointers","slug":"pointers"},{"id":"cmomexzr2008z04jltlv30tp3","kind":"table","name":"SLO log","slug":"slo-log"},{"id":"cmooqoeny000p04jypl2z8uc2","kind":"doc","name":"Observability checklist","slug":"checklist"}],"publicSurfaceContents":[{"slug":"brief","content":"$1f:1:props:initialDocContent"},{"slug":"checklist","content":{"type":"doc","content":[{"type":"heading","attrs":{"level":1},"content":[{"text":"Set up observability: logs, metrics, traces: full checklist","type":"text"}]},{"type":"blockquote","content":[{"type":"paragraph","content":[{"text":"Tick items as you ship. The plan is in ","type":"text"},{"text":"Steps","type":"text","marks":[{"type":"bold"}]},{"text":"; this surface is the literal \"did I do this?\" list. 
Every box maps to a task in a step.","type":"text"}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Step 1: Pick a vendor before you instrument anything","type":"text"}]},{"type":"paragraph","content":[{"text":"2-3 hr of research","type":"text","marks":[{"type":"italic"}]}]},{"type":"taskList","content":[{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"List your top 3 budget constraints (monthly spend, team size, on-call sophistication)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Compare three options: Grafana Cloud (cheapest), Honeycomb (best for traces), Datadog (best UX, priciest)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Estimate volume: log events per day, metrics cardinality, trace spans per second","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Sign up for the free tier of your pick and start there","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Step 2: Switch to structured logging (JSON, with a schema)","type":"text"}]},{"type":"paragraph","content":[{"text":"Half a day","type":"text","marks":[{"type":"italic"}]}]},{"type":"taskList","content":[{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Pick a structured-logging lib for your language (pino in Node, zap in Go, structlog in Python)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Define a JSON schema with 5-10 standard fields every log line carries","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Add request_id as a header that propagates through every middleware and downstream 
call","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Replace string-template log lines with structured fields ('user logged in' becomes ","type":"text"},{"text":"{ event: 'login', user_id, method }","type":"text","marks":[{"type":"code"}]},{"text":")","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Verify a sample of logs in the backend and confirm they're queryable by user_id","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Step 3: Capture the four golden signals as metrics","type":"text"}]},{"type":"paragraph","content":[{"text":"Half a day","type":"text","marks":[{"type":"italic"}]}]},{"type":"taskList","content":[{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Add a request-duration histogram (p50, p95, p99) keyed by route + status_code","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Add a request-count counter keyed by route + status_code","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Add an error-count counter for 5xx responses","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Add saturation metrics: CPU, memory, max DB connection pool, max queue depth","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Build a 4-panel dashboard per service: Latency, Traffic, Errors, Saturation","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Link the dashboard URL in your team docs","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Step 4: Wire up distributed tracing with 
OpenTelemetry","type":"text"}]},{"type":"paragraph","content":[{"text":"1 day","type":"text","marks":[{"type":"italic"}]}]},{"type":"taskList","content":[{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Install the OpenTelemetry SDK + exporter for your language","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Enable auto-instrumentation for HTTP server, HTTP client, DB driver","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Add manual spans around the 5 most important business operations (checkout, signup, cron job)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Configure the exporter to send to your chosen backend (OTLP endpoint)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Verify a request shows up as a complete trace in the backend, with all hops","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Step 5: Define one or two SLOs the team actually believes in","type":"text"}]},{"type":"paragraph","content":[{"text":"Half a day to define, ongoing to refine","type":"text","marks":[{"type":"italic"}]}]},{"type":"taskList","content":[{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Pick the 1-2 user-facing flows that matter most (signup, checkout, primary product action)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"For each, write the SLO: 'X% of [request type] complete in under [latency] / without error, over [window]'","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Calculate the error budget (1 - target = budget; over the window = absolute 
minutes/requests)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Set up the metric query that measures it; verify the historical baseline meets the target","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Decide the policy: what do we DO when we burn budget? (slow features, freeze deploys, etc.)","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Step 6: Set up alerts on burn rate, not on threshold","type":"text"}]},{"type":"paragraph","content":[{"text":"2-3 hr","type":"text","marks":[{"type":"italic"}]}]},{"type":"taskList","content":[{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"For each SLO, configure a fast-burn alert (5% of monthly budget in 1 hour) - pages on-call immediately","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Configure a slow-burn alert (10% of monthly budget in 6 hours) - opens a ticket, no page","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Route alerts to a single on-call rotation (PagerDuty / Opsgenie / on-call-bot in Slack)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Run a synthetic alert drill: trigger a fake burn, verify the page reaches the on-call","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Document the response runbook for each alert","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Step 7: Build the four runbooks: latency, errors, saturation, third-party down","type":"text"}]},{"type":"paragraph","content":[{"text":"Half a 
day","type":"text","marks":[{"type":"italic"}]}]},{"type":"taskList","content":[{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Latency runbook: which dashboard to open, which trace to look at, common causes (cold start, slow DB query, third-party slowdown)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Errors runbook: which logs to query, how to find the deploying commit, the rollback procedure","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Saturation runbook: which metric to check (memory, DB connections, queue depth), how to scale, how to drain a stuck queue","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Third-party-down runbook: which third party (auth provider, payment processor, email provider), the status page URL, the in-app degradation behavior","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Link each runbook from the matching alert","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Step 8: Run a postmortem on the next real incident","type":"text"}]},{"type":"paragraph","content":[{"text":"2-4 hr per incident","type":"text","marks":[{"type":"italic"}]}]},{"type":"taskList","content":[{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Within 48 hr: write the postmortem in a shared doc, blameless tone","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Sections: timeline, impact, root cause, contributing factors, what worked, what didn't, action items","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"For each action item: name a single owner 
and a due date","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Review the postmortem in a 30-min team meeting; capture follow-up questions","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Track action items in your team's tracker until done","type":"text"}]}]}]},{"type":"heading","attrs":{"level":2},"content":[{"text":"Step 9: Review the dashboard weekly; iterate on signal vs noise","type":"text"}]},{"type":"paragraph","content":[{"text":"30 min/week ongoing","type":"text","marks":[{"type":"italic"}]}]},{"type":"taskList","content":[{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Pick a weekly recurring slot (Monday morning works for most teams)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Review SLO compliance over the last 7 days; flag any near burn","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"Review every page from the last 7 days; ask 'was this real?'","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"For each false page, tune the alert (raise threshold, change window, mute by tag)","type":"text"}]}]},{"type":"taskItem","attrs":{"checked":false},"content":[{"type":"paragraph","content":[{"text":"For each new feature shipped, add the metric or trace span that lets you debug it","type":"text"}]}]}]}]}}],"publicTableContents":[{"slug":"steps","columns":"$1f:1:props:initialWorkspace:columns","rows":[{"id":"cmomey0ue008204jumd7yof7g","position":1,"data":{"notes":"","title":"Pick a vendor before you instrument anything","status":"queued","estimate":"2-3 hr of research"}},{"id":"cmomey11u008504jujwnjwze2","position":2,"data":{"notes":"","title":"Switch to structured logging (JSON, with 
a schema)","status":"queued","estimate":"Half a day"}},{"id":"cmomey182008804juov5btlm9","position":3,"data":{"notes":"","title":"Capture the four golden signals as metrics","status":"queued","estimate":"Half a day"}},{"id":"cmomey1eg008b04juv8eegmtv","position":4,"data":{"notes":"","title":"Wire up distributed tracing with OpenTelemetry","status":"queued","estimate":"1 day"}},{"id":"cmomey1m8008e04juxcjdl48x","position":5,"data":{"notes":"","title":"Define one or two SLOs the team actually believes in","status":"queued","estimate":"Half a day to define, ongoing to refine"}},{"id":"cmomey1st008h04juanyd9f0o","position":6,"data":{"notes":"","title":"Set up alerts on burn rate, not on threshold","status":"queued","estimate":"2-3 hr"}},{"id":"cmomey20q008k04ju7wwblsql","position":7,"data":{"notes":"","title":"Build the four runbooks: latency, errors, saturation, third-party down","status":"queued","estimate":"Half a day"}},{"id":"cmomey26t008n04juxicq9qop","position":8,"data":{"notes":"","title":"Run a postmortem on the next real incident","status":"queued","estimate":"2-4 hr per incident"}},{"id":"cmomey2fe008q04jusf63nq5t","position":9,"data":{"notes":"","title":"Review the dashboard weekly; iterate on signal vs noise","status":"queued","estimate":"30 min/week 
ongoing"}}]},{"slug":"pointers","columns":[{"key":"label","type":"text","label":"Label","position":0},{"key":"url","type":"text","label":"URL","position":1},{"key":"kind","type":"status","label":"Kind","options":[{"color":"#0A84FF","label":"official","value":"official"},{"color":"#22C55E","label":"guide","value":"guide"},{"color":"#BF5AF2","label":"tool","value":"tool"},{"color":"#F5B842","label":"community","value":"community"},{"color":"#FF2D92","label":"code","value":"code"}],"position":2},{"key":"step","type":"text","label":"Step","position":3},{"key":"why","type":"text","label":"Why","position":4}],"rows":[{"id":"cmomey2uy008t04ju44vx6scg","position":1,"data":{"url":"https://opentelemetry.io/ecosystem/vendors/","why":"","kind":"official","step":"Pick a vendor before you instrument anything","label":"OpenTelemetry vendor list"}},{"id":"cmomey34g008w04juft4wdg22","position":2,"data":{"url":"https://www.honeycomb.io/pricing","why":"","kind":"official","step":"Pick a vendor before you instrument anything","label":"Honeycomb pricing"}},{"id":"cmomey3az008z04juqub91ly4","position":3,"data":{"url":"https://grafana.com/pricing/","why":"","kind":"official","step":"Pick a vendor before you instrument anything","label":"Grafana Cloud pricing"}},{"id":"cmomey3hb009204ju767sgxbu","position":4,"data":{"url":"https://opentelemetry.io/docs/specs/otel/logs/","why":"","kind":"official","step":"Switch to structured logging (JSON, with a schema)","label":"OpenTelemetry log specification"}},{"id":"cmomey3nu003s04l8z74shkpr","position":5,"data":{"url":"https://github.com/pinojs/pino","why":"","kind":"code","step":"Switch to structured logging (JSON, with a schema)","label":"Pino (Node.js logger)"}},{"id":"cmomey3uv003v04l8w335wkon","position":6,"data":{"url":"https://12factor.net/logs","why":"","kind":"guide","step":"Switch to structured logging (JSON, with a schema)","label":"Twelve-Factor App: 
logs"}},{"id":"cmomey42l003y04l8sm32ggkf","position":7,"data":{"url":"https://sre.google/sre-book/monitoring-distributed-systems/","why":"","kind":"guide","step":"Capture the four golden signals as metrics","label":"Google SRE: four golden signals"}},{"id":"cmomey49h004104l83833n5jz","position":8,"data":{"url":"https://prometheus.io/docs/practices/histograms/","why":"","kind":"official","step":"Capture the four golden signals as metrics","label":"Prometheus: histogram quantiles"}},{"id":"cmomey4g4004404l8dbmhj3or","position":9,"data":{"url":"https://opentelemetry.io/docs/getting-started/","why":"","kind":"official","step":"Wire up distributed tracing with OpenTelemetry","label":"OpenTelemetry: getting started"}},{"id":"cmomey4m5008t04l82uouhb7e","position":10,"data":{"url":"https://opentelemetry.io/docs/zero-code/","why":"","kind":"official","step":"Wire up distributed tracing with OpenTelemetry","label":"OpenTelemetry: auto-instrumentation"}},{"id":"cmomey4vf008w04l81c7yyqrh","position":11,"data":{"url":"https://opentelemetry.io/docs/specs/otlp/","why":"","kind":"official","step":"Wire up distributed tracing with OpenTelemetry","label":"OTLP protocol spec"}},{"id":"cmomey53a008z04l8rp5sz0f2","position":12,"data":{"url":"https://sre.google/workbook/implementing-slos/","why":"","kind":"guide","step":"Define one or two SLOs the team actually believes in","label":"Google SRE Workbook: SLO chapter"}},{"id":"cmomey59j009204l834mjrrnu","position":13,"data":{"url":"https://sloth.dev/","why":"","kind":"tool","step":"Define one or two SLOs the team actually believes in","label":"Sloth: SLOs as code"}},{"id":"cmomey5gd009504l8fg4evx6e","position":14,"data":{"url":"https://sre.google/workbook/alerting-on-slos/","why":"","kind":"guide","step":"Set up alerts on burn rate, not on threshold","label":"Google SRE: alerting on 
SLOs"}},{"id":"cmomey5o1009804l8oj136o7a","position":15,"data":{"url":"https://www.pagerduty.com/resources/learn/alert-fatigue/","why":"","kind":"guide","step":"Set up alerts on burn rate, not on threshold","label":"PagerDuty: alert design"}},{"id":"cmomey5ui009b04l8ashvhetv","position":16,"data":{"url":"https://sre.google/workbook/runbooks/","why":"","kind":"guide","step":"Build the four runbooks: latency, errors, saturation, third-party down","label":"Google SRE: runbook style guide"}},{"id":"cmomey63f009e04l8jhkw4y5u","position":17,"data":{"url":"https://sre.google/sre-book/postmortem-culture/","why":"","kind":"guide","step":"Run a postmortem on the next real incident","label":"Google SRE: postmortem culture"}},{"id":"cmomey69b009h04l8hwzzrh2w","position":18,"data":{"url":"https://extfiles.etsy.com/DebriefingFacilitationGuide.pdf","why":"","kind":"guide","step":"Run a postmortem on the next real incident","label":"Etsy debriefing facilitation guide"}}]},{"slug":"slo-log","columns":[{"key":"title","type":"text","label":"Title","position":0},{"key":"status","type":"status","label":"Status","options":[{"color":"#94A6BD","label":"queued","value":"queued"},{"color":"#0A84FF","label":"active","value":"active"},{"color":"#22C55E","label":"done","value":"done"}],"position":1},{"key":"notes","type":"text","label":"Notes","position":2}],"rows":[{"id":"cmonjzb56003a04ibsclxzscn","position":1,"data":{"date":"2026-05-04","notes":"Availability: 99.92% (target 99.9%). Within budget. p99: 187ms (target <200ms).","status":"active","version":"Q2 2026"}},{"id":"cmonjzbg5003d04ib0awjivs5","position":2,"data":{"date":"2026-05-15","notes":"Availability dropped to 99.7% — 30min outage from DB pool. Investigating.","status":"high","version":"Q2 2026"}},{"id":"cmoot7j5s001604l2aluy6hrz","position":3,"data":{"date":"2026-06-01","notes":"Availability: 99.95%. p99: 178ms. Error rate: 0.04%. 
All within budget.","status":"active","version":"Q2 2026 mid"}},{"id":"cmoot7kcp000004l1rntpx0e1","position":4,"data":{"date":"2026-06-15","notes":"Availability dipped to 99.8% — 60min outage from Stripe webhook delays. Investigating.","status":"high","version":"Q2 2026 late"}},{"id":"cmoot7klv001904l2i9yly60f","position":5,"data":{"date":"2026-07-01","notes":"Reset SLO budget. Targets: 99.9% availability, p99 <200ms, error rate <0.1%.","status":"active","version":"Q3 2026 baseline"}}]}],"initialActiveSurfaceSlug":"brief","publicForkSourceUrl":"https://trydock.ai/templates/set-up-observability/raw.md"}]]