Recipe: LLM eval harness setup
A reproducible pipeline for scoring model outputs against golden datasets.
Prerequisites
- Python 3.11+ with venv
- Golden dataset in JSONL (prompt, reference, metadata)
- Model endpoint (OpenAI-compatible or local vLLM)
Step 1 — scaffold
eval_harness/
├── run.py
├── scorer.py
├── prompts/
│   └── system.txt
├── data/
│   └── golden.jsonl
└── results/
Step 2 — scorer
# scorer.py
import json
from difflib import SequenceMatcher
def exact_match(ref: str, hyp: str) -> float:
    """Score 1.0 when reference and hypothesis agree exactly, else 0.0.

    Leading and trailing whitespace is stripped from both strings before
    they are compared; the comparison itself is case-sensitive.
    """
    expected = ref.strip()
    actual = hyp.strip()
    if expected == actual:
        return 1.0
    return 0.0
def fuzzy_match(ref: str, hyp: str) -> float:
    """Return the character-level similarity of ``ref`` and ``hyp``.

    The value is ``difflib.SequenceMatcher.ratio()``: 1.0 for identical
    strings (including two empty strings) and 0.0 when nothing matches.
    """
    # autojunk=False: by default SequenceMatcher treats any character that is
    # frequent in a second sequence of 200+ items as junk, which silently
    # deflates the ratio for long model outputs. Disabling the heuristic
    # gives the true ratio at the cost of more time on long inputs.
    return SequenceMatcher(None, ref, hyp, autojunk=False).ratio()
# Registry of metric name -> scoring function; score() calls every entry with (ref, hyp).
METRICS = {"exact": exact_match, "fuzzy": fuzzy_match}
def score(ref: str, hyp: str) -> dict:
    return {k: fn(ref, hyp) for k, fn in METRICS.items()}
Step 3 — runner
# run.py
import json, time, requests
from scorer import score
# Load the golden dataset, one JSON object per line. Blank lines (for example
# a trailing newline at the end of the file) are skipped because
# json.loads("") raises.
with open("data/golden.jsonl", encoding="utf-8") as f:
    rows = [json.loads(line) for line in f if line.strip()]

# Query the model once per row and collect the hypothesis, scores and latency.
results = []
for r in rows:
    # perf_counter is monotonic, so the measured latency cannot be distorted
    # by wall-clock adjustments during the request.
    t0 = time.perf_counter()
    resp = requests.post(
        "http://localhost:8000/v1/chat/completions",
        json={
            "model": "meridian-7b",
            "messages": [{"role": "user", "content": r["prompt"]}],
        },
        # Without a timeout, requests waits forever on a stalled endpoint.
        timeout=300,
    )
    # Fail with the HTTP error itself rather than a KeyError on "choices".
    resp.raise_for_status()
    hyp = resp.json()["choices"][0]["message"]["content"]
    lat = time.perf_counter() - t0
    results.append(
        {
            **r,
            "hypothesis": hyp,
            "scores": score(r["reference"], hyp),
            "latency_s": lat,
        }
    )
with open("results/run_1.json","w") as f:
    json.dump(results, f, indent=2)
Step 4 — aggregate
python -c "
import json
# This script sits inside a double-quoted shell string: use single quotes only.
with open('results/run_1.json') as f:
    data = json.load(f)
# An empty run would otherwise fail with ZeroDivisionError in the means below.
if not data:
    raise SystemExit('results/run_1.json contains no rows')
# Mean of each metric over all scored rows.
exact = sum(d['scores']['exact'] for d in data) / len(data)
fuzzy = sum(d['scores']['fuzzy'] for d in data) / len(data)
print(f'exact={exact:.3f} fuzzy={fuzzy:.3f}')
"
Extend METRICS with BLEU, ROUGE, or LLM-as-judge. Swap the endpoint URL for any OpenAI-compatible provider.