"""Ollama Cloud smart model picker — task-aware model selection.

Why this exists
---------------
Ollama Cloud hosts a wide menu of models with very different tradeoffs
(latency, reasoning depth, JSON discipline, context length). Hard-coding
``gpt-oss:20b`` everywhere wastes both signal quality (heavy synthesis on a
small model) and budget (small jobs running on a 1T-param model).

The picker maps a *task profile* — what the caller is trying to do — onto a
concrete Ollama Cloud model. Callers ask for a profile, the picker returns
the model. The factory only falls back to the picker when the arena config
leaves the model unset (``model: auto`` or empty string).

Profiles
--------
``narrative``       Free-form analyst report, prose, low JSON discipline.
``synthesis``       Heavy reasoning, weighs evidence, writes a verdict + score.
``judge``           Routed to the same model as ``synthesis``.
``debate``          Multi-round argumentation; should be opinionated but bounded.
``json_strict``     Caller expects a parseable JSON object. Pick a model with
                    reliable structured output.
``classification``  Short label / signal extraction (BUY/HOLD/SELL etc.).
``quick``           Fast triage, summaries, signal extraction. Latency dominates.
``signal``          Routed to the same model as ``quick``.
``long_context``    Documents > 32K tokens to consider.
``coding``          Code/SQL/regex generation.
``research``        Routed to the largest general model (``LARGEST``).
``trillion``        Routed to the 1T-parameter model (``TRILLION``).

Defaults
--------
The mapping below reflects the public Ollama Cloud catalog as of mid-2026.
Models marked ``-cloud`` are the cloud-hosted variants. If a model is not
available on a given account the caller can override via the arena's
``llm.providers.ollama.model`` field — that pins the model and bypasses
the picker entirely.
"""

from __future__ import annotations

import logging
import os
from dataclasses import dataclass

logger = logging.getLogger(__name__)


# Task profile literal — kept open (str) so callers / configs can extend
# without a code change. Unknown profiles fall through to the tier lookup
# and, failing that, to ``DEFAULT_MODEL``.
TaskProfile = str


# Ollama Cloud catalog. Edit here when new models land. The values are the
# model names the OpenAI-compatible endpoint expects ("model" field in the
# chat-completions request body).
#
# We pick **non-reasoning** models for the quick/narrative profiles because
# gpt-oss leaks chain-of-thought tokens into responses (the Ollama Cloud
# OpenAI-compat shim returns them via message.reasoning, and our client
# falls back to that when content is empty). For latency-sensitive
# specialist analysts that need to emit clean structured JSON, that
# reasoning leak is fatal — we'd rather pay the non-reasoning model's
# slightly higher per-token cost and get parseable output.
HEAVY_REASONING = "gpt-oss:120b"    # synthesis / judge — reasoning OK
GENERAL_INSTRUCT = "glm-4.7"        # narrative / debate / quick — clean JSON
LONG_CONTEXT = "qwen3-coder:480b"   # 200K+ context, coding
LARGEST = "deepseek-v3.1:671b"      # heavy research
QUICK = "glm-4.7"                   # fast structured outputs
TRILLION = "kimi-k2:1t"             # the big one


DEFAULT_MODEL = GENERAL_INSTRUCT


PROFILE_MAP: dict[TaskProfile, str] = {
    "narrative": GENERAL_INSTRUCT,
    "synthesis": HEAVY_REASONING,
    "judge": HEAVY_REASONING,
    "debate": GENERAL_INSTRUCT,
    # NOTE(review): json_strict is routed to the gpt-oss reasoning model even
    # though the catalog comment above says its reasoning leak breaks clean
    # JSON. Left unchanged here — confirm whether this is intentional.
    "json_strict": HEAVY_REASONING,
    "classification": QUICK,
    "quick": QUICK,
    "signal": QUICK,
    "long_context": LONG_CONTEXT,
    "coding": LONG_CONTEXT,
    "research": LARGEST,
    "trillion": TRILLION,
}


# Backward-compat alias: external code (docs, tests) may still reference
# GENERAL_REASONING. Point it at the new instruct model — the rename is
# just to reflect that the quick path no longer goes through gpt-oss.
GENERAL_REASONING = GENERAL_INSTRUCT


# Tier → profile fallback. The arena config asks for a tier ("quick" /
# "deep"); the picker maps that to a profile when no profile is explicit.
TIER_PROFILE: dict[str, TaskProfile] = {
    "quick": "quick",
    "deep": "synthesis",
}


@dataclass(frozen=True)
class PickContext:
    """Inputs to the picker.

    ``profile`` is the primary signal. ``tier`` is a fallback for callers
    that only know the legacy quick/deep tier. ``role`` and ``arena_name``
    are optional refinements — e.g. judge agents always want heavy reasoning.
    """

    profile: TaskProfile | None = None
    tier: str | None = None
    role: str | None = None
    # Carried for callers; ``pick_model`` in this module does not read it.
    arena_name: str | None = None


_ENV_OVERRIDE_PREFIX = "MAF_OLLAMA_MODEL__"
_USE_RANKINGS_ENV = "MAF_USE_RANKINGS"

# Roles that short-circuit the profile/tier lookup and always resolve
# through the ``synthesis`` profile.
_HEAVY_ROLES = ("judge", "synthesis")


def _env_override(profile: TaskProfile) -> str | None:
    """Allow ad-hoc overrides without editing this file.

    ``MAF_OLLAMA_MODEL__SYNTHESIS=gpt-oss:120b-cloud`` overrides the synthesis
    profile. Used by ops to pin a specific model for an experiment.

    Returns the stripped value, or ``None`` when the variable is unset,
    empty, or whitespace-only.
    """
    raw = os.environ.get(_ENV_OVERRIDE_PREFIX + profile.upper())
    # ``or None`` keeps the declared contract: a blank value is "no override"
    # rather than an empty model name.
    return (raw or "").strip() or None


def _rankings_pick_sync(profile: TaskProfile) -> str | None:
    """Try to use OpenRouter rankings to pick a model.

    Synchronous wrapper that reads from the rankings cache *if* it already
    has data for the matching category. Does not trigger a network fetch —
    the dashboard / a startup hook is expected to populate the cache.

    Skipped entirely unless ``MAF_USE_RANKINGS=1`` is set, so the default
    flow is the static PROFILE_MAP (predictable, fast, offline-safe).

    Returns the ranked model name, or ``None`` when rankings are disabled,
    the rankings module cannot be imported, or the cache has no entry for
    the profile's category.
    """
    flag = os.environ.get(_USE_RANKINGS_ENV, "").strip().lower()
    if flag not in ("1", "true", "yes"):
        return None
    try:
        # Imported lazily so the picker works without the rankings module.
        from maf.llm.openrouter_rankings import (
            DEFAULT_CATEGORY,
            PROFILE_TO_CATEGORY,
            best_ollama_for_profile,
            get_cache,
        )
    except ImportError:
        logger.debug("rankings requested but maf.llm.openrouter_rankings is unavailable")
        return None
    category = PROFILE_TO_CATEGORY.get(profile, DEFAULT_CATEGORY)
    cached = get_cache().peek(category)
    if not cached:
        return None
    pick, _dim = best_ollama_for_profile(cached, profile)
    return pick


def _resolve_profile(profile: TaskProfile, static_pick: str | None) -> str | None:
    """Resolve one profile: env override, then cached rankings, then ``static_pick``."""
    return _env_override(profile) or _rankings_pick_sync(profile) or static_pick


def pick_model(ctx: PickContext) -> str:
    """Return the Ollama Cloud model name for ``ctx``.

    Precedence (first match wins):
      1. Role shortcut — ``role`` in (``judge``, ``synthesis``):
         ``MAF_OLLAMA_MODEL__JUDGE`` (judge role only), then the
         ``synthesis`` chain (``MAF_OLLAMA_MODEL__SYNTHESIS`` → rankings →
         ``HEAVY_REASONING``).
      2. ``ctx.profile``: ``MAF_OLLAMA_MODEL__<PROFILE>`` → rankings →
         ``PROFILE_MAP[profile]``. The env override and rankings are
         consulted even for profiles missing from ``PROFILE_MAP``.
      3. ``ctx.tier``: the same chain for ``TIER_PROFILE[tier]``, ending in
         ``DEFAULT_MODEL`` if that profile has no ``PROFILE_MAP`` entry.
      4. ``DEFAULT_MODEL``
    """
    # Role shortcuts run before the profile lookup so that explicit
    # role='judge' always wins over a tier='quick' fallback.
    if ctx.role in _HEAVY_ROLES:
        if ctx.role == "judge":
            # Honour the judge-specific pin, matching what profile='judge'
            # already does via the generic profile path.
            judge_pin = _env_override("judge")
            if judge_pin:
                return judge_pin
        return _resolve_profile("synthesis", HEAVY_REASONING) or HEAVY_REASONING

    if ctx.profile:
        chosen = _resolve_profile(ctx.profile, PROFILE_MAP.get(ctx.profile))
        if chosen:
            return chosen

    if ctx.tier:
        tier_profile = TIER_PROFILE.get(ctx.tier)
        if tier_profile:
            # .get(): a tier added to TIER_PROFILE without a PROFILE_MAP
            # entry degrades to the default instead of raising KeyError.
            static_pick = PROFILE_MAP.get(tier_profile, DEFAULT_MODEL)
            return _resolve_profile(tier_profile, static_pick) or DEFAULT_MODEL

    return DEFAULT_MODEL


def is_auto(model_value: str | None) -> bool:
    """Whether the configured model value should defer to the picker.

    Accepts ``"auto"``, ``""``, ``None``. Case-insensitive on ``auto``;
    surrounding whitespace is ignored.
    """
    if model_value is None:
        return True
    return model_value.strip().lower() in ("", "auto")


__all__ = [
    "DEFAULT_MODEL",
    "GENERAL_INSTRUCT",
    "GENERAL_REASONING",
    "HEAVY_REASONING",
    "LARGEST",
    "LONG_CONTEXT",
    "PROFILE_MAP",
    "PickContext",
    "QUICK",
    "TIER_PROFILE",
    "TRILLION",
    "TaskProfile",
    "is_auto",
    "pick_model",
]