Roadmap to mastering LLMops in 2026

by ai-intensify
0 comments
Roadmap to mastering LLMops in 2026

# llm_with_tracing.py

# Purpose: A production-ready LLM call wrapper with full overview.

# Every call in Langfuse is tracked: input, output, token, cost, latency.

#

# Prerequisites:

# pip install langfuse anthropic python-dotenv

#

# to install:

#1. Create a free account at https://cloud.langfuse.com

#2. Get your keys from Settings > API Keys

#3. Create a .env file with the variables given below

#

# run:

# Python llm_with_tracing.py

Import os

Import Time

From dotenv Import load_dotenv

Import anthropic

From Langfuse Import Langfuse

# Load environment variables from .env file

load_dotenv()

# Required environment variables in your .env:

# LANGFUSE_PUBLIC_KEY=pk-lf-…

# LANGFUSE_SECRET_KEY=sk-lf-…

# LANGFUSE_HOST=https://cloud.langfuse.com (or your self-hosted URL)

# anthropic_api_key=sk-ant-…

# initialize clients

langfuse_client = Langfuse() # Reads keys from the environment automatically

anthropic_client = anthropic.anthropic() # Reads ANTHROPIC_API_KEY from environment

# ──Configuration ──────────────────────── ──────────────────────────

# Store your prompt here, not inline in the API call.

# This makes it freely versionable and testable.

system_prompt = “”“You are a helpful customer support assistant.

Answer questions clearly and concisely.

If you don’t know something, say it straight away – don’t guess.”“”

Sample = “cloud-sonnet-4-20250514”

# Pricing of Anthropic until mid-2026 (update if price changes)

# Used to calculate cost per call for cost tracking

COST_PER_INPUT_TOKEN = 3.00 / 1_000_000 # $3.00 per million input tokens

COST_PER_OUTPUT_TOKEN = 15.00 / 1_000_000 # $15.00 per million output tokens

def call_llm_with_tracing(

user_message: : STR,

session ID: : STR = “default-session”,

user ID: : STR = “Anonymous”

) -> STR: :

“”

Call a traced LLM. Each call creates a languagefuse trace:

– Full input and output

– Token usage (input, output, total)

– Cost calculated in USD

– latency in milliseconds

– Used model and session context

Parameters:

user_message : user’s message

session_id : Groups related calls into one conversation in Langfuse

user_id : Associates the call to a specific user for analytics

Return:

llm response as a string

“”

# Create a top-level trace for this user interaction

# The trace appears as a unit of work in the Langfuse dashboard

trace = langfuse_client.trace(

Name=“customer-support-call”,

session ID=session ID,

user ID=user ID,

input={“user_message”: : user_message, “system_prompt”: : system_prompt}

)

# Create a generation period inside the trace

# This captures model-specific details: model name, token, cost

generation = trace.generation(

Name=“Cloud-Closing”,

Sample=Sample,

input={

“System”: : system_prompt,

“Message”: : ({“Role”: : “user”, “Material”: : user_message})

}

)

start time = Time.Time()

Effort: :

# call api

feedback = anthropic_client.messages.create(

Sample=Sample,

max_tokens=1024,

System=system_prompt,

messages=({“Role”: : “user”, “Material”: : user_message})

)

latency_ms = int here((Time.Time() start time) * 1000)

# Remove response text

response_text = feedback.Material(0).Basic lesson

# Remove token usage from response

input_token = feedback.Use.input_token

output_token = feedback.Use.output_token

total_tokens = input_token + output_token

# Calculate the cost of this call

cost_USD = (

input_token* COST_PER_INPUT_TOKEN +

output_token* COST_PER_OUTPUT_token

)

# Update the generation period with the results

# This data populates the Langfuse cost and token dashboard

generation.Ending(

output=response_text,

Use={

“input”: : input_token,

“Output”: : output_token,

“Total”: : total_tokens,

“Unit”: : “token”

},

metadata={

“latency_ms”: : latency_ms,

“cost_usd”: : Round(cost_USD, 6),

“Sample”: : Sample

}

)

# Update the trace with the last output

trace.update(

output={“feedback”: : response_text},

metadata={“total_cost_USD”: : Round(cost_USD, 6)}

)

# Print a summary to stdout for local visibility

printing(F“n{‘─’ * 60}”)

printing(F“User:{user_message}”)

printing(F“Cloud: {response_text}”)

printing(F“Tokens: {input_tokens} in / {output_tokens} out / {total_tokens} total”)

printing(F“Cost: ${cost_usd:.6f}”)

printing(F“latency:{latency_ms}ms”)

printing(F“trace: {langfuse_client.base_url}/trace/{trace.id}”)

printing(F“{‘─’ * 60}n”)

return response_text

except Exception As E: :

# Record the error in the trace so that it appears in Langfuse

generation.Ending(

output=nobody,

metadata={“Mistake”: : STR(E), “latency_ms”: : int here((Time.Time() start time) * 1000)}

)

trace.update(output={“Mistake”: : STR(E)})

# Always flush before raising – this ensures the error trace is sent

langfuse_client.redness()

Raise

At the end: :

# flush sends all buffered events to langfuse

# In long running service, the Langfuse is automatically flushed.

# In a script, you must manually flush before the process ends.

langfuse_client.redness()

# ─ Run a demo

If __Name__ == “__main__”: :

# Simulate two stages of a customer support conversation

test message = (

“What is your return policy for electronics?”,

“Can I return an item purchased 45 days ago?”

)

Session = “demo-session-001”

For I, Message In tell one by one(test message): :

printing(F“ncall {i + 1}/{len(test_messages)}”)

Effort: :

call_llm_with_tracing(

user_message=Message,

session ID=Session,

user ID=“test-user-42”

)

except Exception As E: :

printing(F“Error on call {i + 1}: {e}”)

Related Articles

Leave a Comment