checking system…
Docs / back / src/maf/llm/model_picker.py · line 162
Python · 233 lines
  1"""Ollama Cloud smart model picker — task-aware model selection.
  2
  3Why this exists
  4---------------
  5Ollama Cloud hosts a wide menu of models with very different tradeoffs
  6(latency, reasoning depth, JSON discipline, context length). Hard-coding
  7``gpt-oss:20b`` everywhere wastes both signal quality (heavy synthesis on a
  8small model) and budget (small jobs running on a 1T-param model).
  9
 10The picker maps a *task profile* — what the caller is trying to do — onto a
 11concrete Ollama Cloud model. Callers ask for a profile, the picker returns
 12the model. The factory only falls back to the picker when the arena config
 13leaves the model unset (``model: auto`` or empty string).
 14
 15Profiles
 16--------
``narrative``    Free-form analyst report, prose, low JSON discipline.
``synthesis``    Heavy reasoning, weighs evidence, writes a verdict + score.
``judge``        Routed to the same heavy-reasoning model as ``synthesis``.
``debate``       Multi-round argumentation; should be opinionated but bounded.
``json_strict``  Caller expects a parseable JSON object. Pick a model with
                 reliable structured output.
``classification`` Short label / signal extraction (BUY/HOLD/SELL etc.).
``quick``        Fast triage, summaries, signal extraction. Latency dominates.
``signal``       Routed to the same fast model as ``quick``.
``long_context`` Inputs of more than 32K tokens.
``coding``       Code/SQL/regex generation.
``research``     Heavy research; routed to the ``LARGEST`` model.
``trillion``     Explicit opt-in to the 1T-parameter model.
 26
 27Defaults
 28--------
 29The mapping below reflects the public Ollama Cloud catalog as of mid-2026.
 30Models marked ``-cloud`` are the cloud-hosted variants. If a model is not
 31available on a given account the caller can override via the arena's
 32``llm.providers.ollama.model`` field — that pins the model and bypasses
 33the picker entirely.
 34"""
 35
 36from __future__ import annotations
 37
 38import logging
 39import os
 40from dataclasses import dataclass
 41
 42logger = logging.getLogger(__name__)
 43
 44
 45# Task profile literal — kept open (str) so callers / configs can extend
 46# without a code change. Unknown profiles fall through to ``DEFAULT_MODEL``.
 47TaskProfile = str
 48
 49
 50# Ollama Cloud catalog. Edit here when new models land. The values are the
 51# model names the OpenAI-compatible endpoint expects ("model" field in the
 52# chat-completions request body).
 53#
 54# We pick **non-reasoning** models for the quick/narrative profiles because
 55# gpt-oss leaks chain-of-thought tokens into responses (the Ollama Cloud
 56# OpenAI-compat shim returns them via message.reasoning, and our client
 57# falls back to that when content is empty). For latency-sensitive
 58# specialist analysts that need to emit clean structured JSON, that
 59# reasoning leak is fatal — we'd rather pay the non-reasoning model's
 60# slightly higher per-token cost and get parseable output.
 61HEAVY_REASONING = "gpt-oss:120b"             # synthesis / judge — reasoning OK
 62GENERAL_INSTRUCT = "glm-4.7"                  # narrative / debate / quick — clean JSON
 63LONG_CONTEXT = "qwen3-coder:480b"             # 200K+ context, coding
 64LARGEST = "deepseek-v3.1:671b"                # heavy research
 65QUICK = "glm-4.7"                             # fast structured outputs
 66TRILLION = "kimi-k2:1t"                       # the big one
 67
 68
 69DEFAULT_MODEL = GENERAL_INSTRUCT
 70
 71
 72PROFILE_MAP: dict[TaskProfile, str] = {
 73    "narrative":      GENERAL_INSTRUCT,
 74    "synthesis":      HEAVY_REASONING,
 75    "judge":          HEAVY_REASONING,
 76    "debate":         GENERAL_INSTRUCT,
 77    "json_strict":    HEAVY_REASONING,
 78    "classification": QUICK,
 79    "quick":          QUICK,
 80    "signal":         QUICK,
 81    "long_context":   LONG_CONTEXT,
 82    "coding":         LONG_CONTEXT,
 83    "research":       LARGEST,
 84    "trillion":       TRILLION,
 85}
 86
 87
 88# Backward-compat alias: external code (docs, tests) may still reference
 89# GENERAL_REASONING. Point it at the new instruct model — the rename is
 90# just to reflect that the quick path no longer goes through gpt-oss.
 91GENERAL_REASONING = GENERAL_INSTRUCT
 92
 93
 94# Tier → profile fallback. The arena config asks for a tier ("quick" /
 95# "deep"); the picker maps that to a profile when no profile is explicit.
 96TIER_PROFILE: dict[str, TaskProfile] = {
 97    "quick": "quick",
 98    "deep":  "synthesis",
 99}
100
101
@dataclass(frozen=True)
class PickContext:
    """Immutable bundle of hints that ``pick_model`` uses to choose a model.

    Every field is optional and defaults to ``None``.

    ``profile`` is the primary signal. ``tier`` is a fallback for callers
    that only know the legacy quick/deep tier. ``role`` is an optional
    refinement — ``judge`` / ``synthesis`` roles are always routed to heavy
    reasoning. ``arena_name`` is carried along as context; ``pick_model``
    in this module does not read it.
    """

    # Task profile name, typically a key of ``PROFILE_MAP``.
    profile: TaskProfile | None = None
    # Legacy tier name, translated to a profile through ``TIER_PROFILE``.
    tier: str | None = None
    # Agent role; "judge" and "synthesis" short-circuit to heavy reasoning.
    role: str | None = None
    # Arena identifier; not consulted by ``pick_model`` in this module.
    arena_name: str | None = None
115
116
# Prefix of the per-profile override variables: ``MAF_OLLAMA_MODEL__<PROFILE>``.
_ENV_OVERRIDE_PREFIX = "MAF_OLLAMA_MODEL__"
# Opt-in switch for the rankings-based picker.
_USE_RANKINGS_ENV = "MAF_USE_RANKINGS"


def _env_override(profile: TaskProfile) -> str | None:
    """Return the env-var model override for ``profile``, or ``None``.

    Allows ad-hoc overrides without editing this file.
    ``MAF_OLLAMA_MODEL__SYNTHESIS=gpt-oss:120b-cloud`` overrides the synthesis
    profile. Used by ops to pin a specific model for an experiment.

    The variable name is the prefix plus the upper-cased profile. The value
    is returned with surrounding whitespace stripped. An unset, empty, or
    whitespace-only variable yields ``None``.
    """
    raw = os.environ.get(_ENV_OVERRIDE_PREFIX + profile.upper())
    if raw is None:
        return None
    # A blank value means "no override": return None rather than "" so the
    # ``str | None`` contract holds for callers that test ``is None``.
    return raw.strip() or None
130
131
def _rankings_pick_sync(profile: TaskProfile) -> str | None:
    """Return the rankings-cache model pick for ``profile``, or ``None``.

    Opt-in only: unless ``MAF_USE_RANKINGS`` is ``1``, ``true`` or ``yes``
    (any letter case) the function returns ``None`` immediately, so the
    default flow stays on the static ``PROFILE_MAP`` (predictable, fast,
    offline-safe).

    When enabled, it only reads the rankings cache through ``peek`` for the
    category matching ``profile``; filling the cache is expected to be done
    elsewhere (the dashboard / a startup hook). ``None`` is also returned
    when ``maf.llm.openrouter_rankings`` cannot be imported or when the
    cache holds nothing for the category.
    """
    flag = os.environ.get(_USE_RANKINGS_ENV, "").lower()
    if flag not in {"1", "true", "yes"}:
        return None

    # Imported lazily so that this module works without the rankings module.
    try:
        from maf.llm.openrouter_rankings import (
            DEFAULT_CATEGORY,
            PROFILE_TO_CATEGORY,
            best_ollama_for_profile,
            get_cache,
        )
    except ImportError:
        return None

    category = PROFILE_TO_CATEGORY.get(profile, DEFAULT_CATEGORY)
    snapshot = get_cache().peek(category)
    if snapshot:
        # The second element of the returned pair is not needed here.
        model, _ = best_ollama_for_profile(snapshot, profile)
        return model
    return None
160
161
def _dynamic_pick(profile: TaskProfile) -> str | None:
    """Return the env-override or rankings-cache pick for ``profile``.

    The env override is consulted first; the rankings cache is only asked
    when no override is set. Returns ``None`` when neither yields a model.
    """
    return _env_override(profile) or _rankings_pick_sync(profile)


def pick_model(ctx: PickContext) -> str:
    """Return the Ollama Cloud model name for ``ctx``.

    Candidate profiles are tried in this order; the first that resolves wins:

      1. role shortcut — a ``role`` of ``judge`` / ``synthesis`` is resolved
         as the ``synthesis`` profile and bottoms out at ``HEAVY_REASONING``,
         regardless of ``profile`` and ``tier``
      2. ``ctx.profile``
      3. ``TIER_PROFILE[ctx.tier]``
      4. ``DEFAULT_MODEL``

    Each candidate profile is itself resolved in this order:

      a. ``MAF_OLLAMA_MODEL__<PROFILE>`` env override
      b. rankings cache (only when ``MAF_USE_RANKINGS`` is enabled)
      c. ``PROFILE_MAP[profile]``

    A profile unknown to ``PROFILE_MAP`` with no override or rankings pick
    falls through to the next candidate. A tier whose mapped profile is
    missing from ``PROFILE_MAP`` resolves to ``DEFAULT_MODEL`` instead of
    raising ``KeyError``.
    """
    # Role shortcuts run before the profile lookup so that explicit
    # role='judge' always wins over a tier='quick' fallback.
    if ctx.role in ("judge", "synthesis"):
        return _dynamic_pick("synthesis") or HEAVY_REASONING

    if ctx.profile:
        model = _dynamic_pick(ctx.profile) or PROFILE_MAP.get(ctx.profile)
        if model:
            return model

    if ctx.tier:
        tier_profile = TIER_PROFILE.get(ctx.tier)
        if tier_profile:
            # ``.get`` keeps the documented "unknown profiles fall through
            # to DEFAULT_MODEL" behaviour if the two tables drift apart.
            return _dynamic_pick(tier_profile) or PROFILE_MAP.get(
                tier_profile, DEFAULT_MODEL
            )

    return DEFAULT_MODEL
205
206
207def is_auto(model_value: str | None) -> bool:
208    """Whether the configured model value should defer to the picker.
209
210    Accepts ``"auto"``, ``""``, ``None``. Case-insensitive on ``auto``.
211    """
212    if model_value is None:
213        return True
214    s = model_value.strip().lower()
215    return s == "" or s == "auto"
216
217
# Public API of this module. ``GENERAL_INSTRUCT`` is the canonical name of
# the instruct-model constant; ``GENERAL_REASONING`` is kept only as its
# backward-compatible alias, so both are exported.
__all__ = [
    "DEFAULT_MODEL",
    "GENERAL_INSTRUCT",
    "GENERAL_REASONING",
    "HEAVY_REASONING",
    "LARGEST",
    "LONG_CONTEXT",
    "PROFILE_MAP",
    "PickContext",
    "QUICK",
    "TIER_PROFILE",
    "TRILLION",
    "TaskProfile",
    "is_auto",
    "pick_model",
]