# Bootstrap imports for the environment-setup section below.
# (A stray no-op literal "0" — a leftover notebook cell marker — was removed.)
import subprocess, sys, os, shutil, glob
def pip_install(args):
    """Quietly pip-install *args* (an iterable of requirement strings).

    Raises subprocess.CalledProcessError if pip exits non-zero.
    """
    cmd = [sys.executable, "-m", "pip", "install", "-q", *args]
    subprocess.run(cmd, check=True)
# Pin huggingface_hub first so transformers resolves against a compatible hub.
# NOTE: a one-element tuple needs the trailing comma — without it the bare
# parenthesized string would be unpacked character-by-character by *args.
pip_install(("huggingface_hub>=0.26,<1.0",))
pip_install((
    "-U",
    "transformers>=4.49,<4.57",
    "accelerate>=0.33.0",
    "bitsandbytes>=0.43.0",
    "peft>=0.11.0",
    "datasets>=2.20.0,<3.0",
    "sentence-transformers>=3.0.0,<4.0",
    "faiss-cpu",
))
# Remove any stale cached "remote code" modules for Phi-4 so the freshly
# installed transformers uses its built-in implementation instead.
_phi_cache_pattern = os.path.expanduser(
    "~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-4*")
for _stale_dir in glob.glob(_phi_cache_pattern):
    shutil.rmtree(_stale_dir, ignore_errors=True)
# Evict any already-imported copies of the HF stack so the versions just
# installed above are the ones actually loaded by the imports below.
for _m in list(sys.modules):
    if _m.startswith(("transformers", "huggingface_hub", "tokenizers",
                      "accelerate", "peft", "datasets",
                      "sentence_transformers")):
        # sys.modules is a dict: delete by subscription, not by calling it
        del sys.modules[_m]
# Runtime imports happen only after the cache/module purge above, so the
# freshly installed wheels are the ones that get loaded.
import json, re, textwrap, warnings, torch
warnings.filterwarnings("ignore")  # suppress all warnings (noisy HF output)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextStreamer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
import transformers
# Echo the resolved version to confirm the pinned range took effect.
print(f"Using transformers {transformers.__version__}")
# HF model id; loaded without trust_remote_code (built-in phi3 architecture).
PHI_MODEL_ID = "microsoft/Phi-4-mini-instruct"
# Hard requirement: the 4-bit bitsandbytes path below needs a CUDA device.
assert torch.cuda.is_available(), (
    "No GPU detected. In Colab: Runtime > Change runtime type > T4 GPU."
)
print(f"GPU detected: {torch.cuda.get_device_name(0)}")
# Fixed: the f-string ended with a literal "n" (lost "\n" escape).
print(f"Loading Phi model (native phi3 arch, no remote code): {PHI_MODEL_ID}\n")
# 4-bit NF4 quantization config so the model fits in a small GPU's memory.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",              # NormalFloat4 weight format
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute (matmul) dtype
    bnb_4bit_use_double_quant=True,         # also quantize the quant constants
)
phi_tokenizer = AutoTokenizer.from_pretrained(PHI_MODEL_ID)
# Batched generation needs a pad token; fall back to EOS when the
# checkpoint does not define one.
if phi_tokenizer.pad_token_id is None:
    phi_tokenizer.pad_token = phi_tokenizer.eos_token
phi_model = AutoModelForCausalLM.from_pretrained(
    PHI_MODEL_ID,
    quantization_config=bnb_cfg,  # 4-bit NF4 config defined above
    device_map="auto",            # let accelerate place layers on the GPU
    torch_dtype=torch.bfloat16,
)
# Enable the KV cache to speed up generation in ask_phi() below.
phi_model.config.use_cache = True
# Report load status. Fixed: the first f-string began with a literal "n"
# (lost "\n" escape).
print(f"\n✓ Phi-4-mini loaded in 4-bit. "
      f"GPU memory: {torch.cuda.memory_allocated()/1e9:.2f} GB")
print(f" Architecture: {phi_model.config.model_type} "
      f"(using built-in {type(phi_model).__name__})")
print(f" Parameters: ~{sum(p.numel() for p in phi_model.parameters())/1e9:.2f}B")
def ask_phi(messages, *, tools=None, max_new_tokens=512,
            temperature=0.3, stream=False):
    """Single entry point for all Phi-4-mini inference calls below.

    Args:
        messages: chat-format list of role/content message dicts.
        tools: optional tool schemas forwarded to the chat template.
        max_new_tokens: generation budget for new tokens.
        temperature: 0 means greedy decoding; >0 enables sampling.
        stream: when True, tokens are also printed as they are generated.

    Returns:
        The decoded completion only (prompt stripped), whitespace-trimmed.
    """
    prompt_ids = phi_tokenizer.apply_chat_template(
        messages,
        tools=tools,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(phi_model.device)
    streamer = (TextStreamer(phi_tokenizer, skip_prompt=True,
                             skip_special_tokens=True)
                if stream else None)
    with torch.inference_mode():
        out = phi_model.generate(
            prompt_ids,
            max_new_tokens=max_new_tokens,
            do_sample=temperature > 0,
            # keep temperature strictly positive for the sampling path
            temperature=max(temperature, 1e-5),
            top_p=0.9,
            pad_token_id=phi_tokenizer.pad_token_id,
            eos_token_id=phi_tokenizer.eos_token_id,
            streamer=streamer,
        )
    # Fixed: tensors are subscripted with brackets, not called with parens.
    # Slice off the prompt tokens, keeping only the newly generated tail.
    return phi_tokenizer.decode(
        out[0][prompt_ids.shape[1]:], skip_special_tokens=True
    ).strip()
def banner(title):
    """Print *title* between two 78-character '=' rules.

    Fixed: all three "\\n" escapes had degraded to literal "n" characters.
    """
    print("\n" + "=" * 78 + f"\n {title}\n" + "=" * 78)