# llm_with_tracing.py
# Purpose: A production-ready LLM call wrapper with full overview.
# Every call in Langfuse is tracked: input, output, token, cost, latency.
#
# Prerequisites:
# pip install langfuse anthropic python-dotenv
#
# to install:
#1. Create a free account at https://cloud.langfuse.com
#2. Get your keys from Settings > API Keys
#3. Create a .env file with the variables given below
#
# run:
# Python llm_with_tracing.py
Import os
Import Time
From dotenv Import load_dotenv
Import anthropic
From Langfuse Import Langfuse
# Load environment variables from .env file
load_dotenv()
# Required environment variables in your .env:
# LANGFUSE_PUBLIC_KEY=pk-lf-…
# LANGFUSE_SECRET_KEY=sk-lf-…
# LANGFUSE_HOST=https://cloud.langfuse.com (or your self-hosted URL)
# anthropic_api_key=sk-ant-…
# initialize clients
langfuse_client = Langfuse() # Reads keys from the environment automatically
anthropic_client = anthropic.anthropic() # Reads ANTHROPIC_API_KEY from environment
# ──Configuration ──────────────────────── ──────────────────────────
# Store your prompt here, not inline in the API call.
# This makes it freely versionable and testable.
system_prompt = “”“You are a helpful customer support assistant.
Answer questions clearly and concisely.
If you don’t know something, say it straight away – don’t guess.”“”
Sample = “cloud-sonnet-4-20250514”
# Pricing of Anthropic until mid-2026 (update if price changes)
# Used to calculate cost per call for cost tracking
COST_PER_INPUT_TOKEN = 3.00 / 1_000_000 # $3.00 per million input tokens
COST_PER_OUTPUT_TOKEN = 15.00 / 1_000_000 # $15.00 per million output tokens
def call_llm_with_tracing(
user_message: : STR,
session ID: : STR = “default-session”,
user ID: : STR = “Anonymous”
) -> STR: :
“”“
Call a traced LLM. Each call creates a languagefuse trace:
– Full input and output
– Token usage (input, output, total)
– Cost calculated in USD
– latency in milliseconds
– Used model and session context
Parameters:
user_message : user’s message
session_id : Groups related calls into one conversation in Langfuse
user_id : Associates the call to a specific user for analytics
Return:
llm response as a string
““”
# Create a top-level trace for this user interaction
# The trace appears as a unit of work in the Langfuse dashboard
trace = langfuse_client.trace(
Name=“customer-support-call”,
session ID=session ID,
user ID=user ID,
input={“user_message”: : user_message, “system_prompt”: : system_prompt}
)
# Create a generation period inside the trace
# This captures model-specific details: model name, token, cost
generation = trace.generation(
Name=“Cloud-Closing”,
Sample=Sample,
input={
“System”: : system_prompt,
“Message”: : ({“Role”: : “user”, “Material”: : user_message})
}
)
start time = Time.Time()
Effort: :
# call api
feedback = anthropic_client.messages.create(
Sample=Sample,
max_tokens=1024,
System=system_prompt,
messages=({“Role”: : “user”, “Material”: : user_message})
)
latency_ms = int here((Time.Time() – start time) * 1000)
# Remove response text
response_text = feedback.Material(0).Basic lesson
# Remove token usage from response
input_token = feedback.Use.input_token
output_token = feedback.Use.output_token
total_tokens = input_token + output_token
# Calculate the cost of this call
cost_USD = (
input_token* COST_PER_INPUT_TOKEN +
output_token* COST_PER_OUTPUT_token
)
# Update the generation period with the results
# This data populates the Langfuse cost and token dashboard
generation.Ending(
output=response_text,
Use={
“input”: : input_token,
“Output”: : output_token,
“Total”: : total_tokens,
“Unit”: : “token”
},
metadata={
“latency_ms”: : latency_ms,
“cost_usd”: : Round(cost_USD, 6),
“Sample”: : Sample
}
)
# Update the trace with the last output
trace.update(
output={“feedback”: : response_text},
metadata={“total_cost_USD”: : Round(cost_USD, 6)}
)
# Print a summary to stdout for local visibility
printing(F“n{‘─’ * 60}”)
printing(F“User:{user_message}”)
printing(F“Cloud: {response_text}”)
printing(F“Tokens: {input_tokens} in / {output_tokens} out / {total_tokens} total”)
printing(F“Cost: ${cost_usd:.6f}”)
printing(F“latency:{latency_ms}ms”)
printing(F“trace: {langfuse_client.base_url}/trace/{trace.id}”)
printing(F“{‘─’ * 60}n”)
return response_text
except Exception As E: :
# Record the error in the trace so that it appears in Langfuse
generation.Ending(
output=nobody,
metadata={“Mistake”: : STR(E), “latency_ms”: : int here((Time.Time() – start time) * 1000)}
)
trace.update(output={“Mistake”: : STR(E)})
# Always flush before raising – this ensures the error trace is sent
langfuse_client.redness()
Raise
At the end: :
# flush sends all buffered events to langfuse
# In long running service, the Langfuse is automatically flushed.
# In a script, you must manually flush before the process ends.
langfuse_client.redness()
# ─ Run a demo
If __Name__ == “__main__”: :
# Simulate two stages of a customer support conversation
test message = (
“What is your return policy for electronics?”,
“Can I return an item purchased 45 days ago?”
)
Session = “demo-session-001”
For I, Message In tell one by one(test message): :
printing(F“ncall {i + 1}/{len(test_messages)}”)
Effort: :
call_llm_with_tracing(
user_message=Message,
session ID=Session,
user ID=“test-user-42”
)
except Exception As E: :
printing(F“Error on call {i + 1}: {e}”)