Design a complete multimodal RLVR pipeline with OpenMM-RL, vision-language prompting, reward scoring, and GRPO export

by ai-intensify
0 comments
Design a complete multimodal RLVR pipeline with OpenMM-RL, vision-language prompting, reward scoring, and GRPO export

# Regexes tried in order by extract_final(); each captures the answer text in
# group 1. Matching is case-insensitive (see extract_final).
EXTRACT_PATS = (
    r"\\boxed\{([^{}]+)\}",                     # LaTeX \boxed{...} (no nested braces)
    r"final\s+answer\s*[:=]\s*([^\n]+)",        # "Final answer: ..." up to end of line
    r"answer\s*[:=]\s*([^\n]+)",                # "answer = ..." up to end of line
)


def extract_final(text):
    """Pull the final answer out of a free-form model response.

    The patterns in EXTRACT_PATS are tried in order; the first match wins and
    its captured group is returned with surrounding whitespace and trailing
    ``.,;`` punctuation removed. If no pattern matches, the last non-blank
    line of the response is returned (stripped of whitespace only).

    Args:
        text: The model response. Falsy values (``None``, ``""``) yield ``""``.

    Returns:
        The extracted answer string, or ``""`` when there is nothing to extract.
    """
    if not text:
        return ""
    text = str(text)
    for pattern in EXTRACT_PATS:
        match = re.search(pattern, text, flags=re.IGNORECASE)
        if match:
            return match.group(1).strip().strip(".,;")
    # Fallback: no explicit answer marker, so use the last non-blank line.
    lines = [line.strip() for line in text.strip().splitlines() if line.strip()]
    return lines[-1] if lines else ""
def latex_to_sympy(s):
    """Rewrite a small subset of LaTeX math into a sympy-parsable string.

    This is a purely textual, best-effort conversion; it does not parse LaTeX.
    Steps, in order:

    1. Strip whitespace and surrounding ``$`` signs.
    2. Drop a leading ``\\(`` / ``\\[`` and a trailing ``\\)`` / ``\\]``.
    3. Replace ``\\pi`` with ``pi``, ``\\cdot`` / ``\\times`` with ``*``, and
       remove the spacing commands ``\\,`` ``\\;`` ``\\!``.
    4. Turn ``\\frac{a}{b}`` into ``((a)/(b))`` and ``\\sqrt{a}`` into
       ``sqrt(a)`` (arguments without nested braces only).
    5. Replace ``^`` with ``**``.
    6. Remove any remaining ``\\command`` names and turn braces into parentheses.

    Args:
        s: LaTeX-ish source string; ``None`` is treated as ``""``.

    Returns:
        The rewritten string (not guaranteed to be valid sympy input).
    """
    s = (s or "").strip().strip("$").strip()
    # Remove display/inline math delimiters: \( \[ at the start, \) \] at the end.
    s = re.sub(r"^\\[\(\[]", "", s)
    s = re.sub(r"\\[\)\]]$", "", s)
    # Raw strings matter here: "\times" in a normal literal would start with a TAB.
    literal_rewrites = (
        (r"\pi", "pi"),
        (r"\cdot", "*"),
        (r"\times", "*"),
        (r"\,", ""),
        (r"\;", ""),
        (r"\!", ""),
    )
    for latex, plain in literal_rewrites:
        s = s.replace(latex, plain)
    s = re.sub(r"\\frac\s*\{([^{}]+)\}\s*\{([^{}]+)\}", r"((\1)/(\2))", s)
    s = re.sub(r"\\sqrt\s*\{([^{}]+)\}", r"sqrt(\1)", s)
    s = s.replace("^", "**")
    # Drop every other LaTeX command name (e.g. \left, \mathrm), keeping its argument.
    s = re.sub(r"\\[a-zA-Z]+", "", s)
    s = s.replace("{", "(").replace("}", ")")
    return s
def grade(pred, gold, tol=1e-4):
    """Verifiable reward in [0, 1]: exact > numeric > sympy-symbolic > partial.

    The final answer is extracted from ``pred`` with ``extract_final`` and then
    compared against ``gold`` through a cascade of checks; the first one that
    succeeds decides the reward:

    1. Normalized string equality (lower-cased, whitespace removed, surrounding
       ``$ . , ; ( ) [ ]`` stripped) -> 1.0
    2. Both sides convert to floats whose difference, divided by
       ``max(1, |gold|)``, is below ``tol`` -> 1.0
    3. ``sympy.simplify(pred - gold) == 0`` -> 1.0
    4. Normalized gold is a non-empty substring of normalized pred -> 0.5

    Args:
        pred: Model response (any object; converted with ``str``).
        gold: Reference answer (any object; converted with ``str``).
        tol: Tolerance for the numeric comparison in step 2.

    Returns:
        1.0, 0.5 or 0.0. ``None`` for either argument yields 0.0.
    """
    if pred is None or gold is None:
        return 0.0
    p = extract_final(str(pred)).strip()
    g = str(gold).strip()

    def norm(x):
        # Case-fold, drop ALL whitespace, then trim wrapping punctuation/brackets.
        return re.sub(r"\s+", "", x.lower()).strip("$.,;()[]")

    if norm(p) == norm(g):
        return 1.0

    # SECURITY NOTE(review): sp.sympify evaluates its input with eval() under
    # the hood, and `p` comes from model output. Run the grader in a sandbox or
    # switch to a restricted parser before using this on untrusted generations.
    def to_float(x):
        # Best effort by design: any parse failure just means "not numeric".
        try:
            return float(latex_to_sympy(x))
        except Exception:
            try:
                return float(sp.sympify(latex_to_sympy(x)).evalf())
            except Exception:
                return None

    fp, fg = to_float(p), to_float(g)
    if fp is not None and fg is not None:
        # Relative error for large gold values, absolute error for |gold| < 1.
        if abs(fp - fg) / max(1.0, abs(fg)) < tol:
            return 1.0
    try:
        ep = sp.sympify(latex_to_sympy(p))
        eg = sp.sympify(latex_to_sympy(g))
        if sp.simplify(ep - eg) == 0:
            return 1.0
    except Exception:
        # Unparseable expressions are expected here; fall through to the
        # partial-credit check instead of failing the whole grading call.
        pass
    if norm(g) and norm(g) in norm(p):
        return 0.5
    return 0.0
# Smoke test for grade(): prints the reward next to the value the tutorial
# expects. Nothing is asserted, so mismatches are only visible in the output.
print("\n=== Grader sanity checks ===")
for pred, gold, want in (
    # Raw strings keep the LaTeX backslashes intact ("\b" / "\f" would
    # otherwise become backspace / form-feed characters).
    (r"The answer is \boxed{120}",          "(120)",             1.0),
    (r"After computing: 7396 \pi",          r"7396\pi",          1.0),
    ("Final answer: -71/4",                 r"-\frac{71}{4}",    1.0),
    ("Therefore the result is 0.0074",      "0.0074",            1.0),
    ("Final answer: nucleus accumbens",     "Nucleus accumbens", 1.0),
    ("I don't know",                        "12",                0.0),
):
    # pred is truncated to 38 characters so the columns stay aligned.
    print(f"  pred={pred[:38]!r:42s} gold={gold!r:22s} -> r={grade(pred, gold)}  (want {want})")
# Instruction header placed at the top of every prompt built by build_prompt().
# NOTE(review): the text after "Final answer: " is empty here; the source this
# was copied from may have had a placeholder (e.g. an angle-bracket tag) that
# was lost. Confirm against the original before training on these prompts.
SYSTEM = (
    "You are a STEM expert solving multimodal reasoning problems. "
    "You will see a question and one or more figures. "
    "Reason step by step, then end with exactly one line:\n"
    "Final answer: "
)


def build_prompt(ex):
    """Build the text prompt for one example.

    Layout: the SYSTEM header, one ``(Image N)`` tag per image (1-based, one
    per line), the question, and a closing chain-of-thought cue, separated by
    blank lines.

    Args:
        ex: Mapping-like example; ``ex["images"]`` must support ``len()`` and
            ``ex["question"]`` is interpolated into the prompt.

    Returns:
        The assembled prompt string.
    """
    img_tags = "\n".join(
        f"(Image {idx})" for idx in range(1, len(ex["images"]) + 1)
    )
    return (
        f"{SYSTEM}\n\n{img_tags}\n\n"
        f"Question:\n{ex['question']}\n\n"
        "Let's think step by step."
    )
# Show the first 600 characters of the prompt built for the first dataset
# example (`ds` is defined outside this section).
print("\n=== Example prompt (truncated) ===")
print(build_prompt(ds[0])[:600], "...\n")

Related Articles

Leave a Comment