diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index cdedc9158..1760a7fb2 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -62,8 +62,7 @@ jobs: run: nohup ollama serve & - name: Pull models run: | - ollama pull granite4:micro - ollama pull granite4:micro-h + ollama pull granite4.1:3b - name: Run Tests id: tests run: uv run -m pytest -v --junit-xml=/tmp/pytest-results.xml test diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d8bad08b9..7398a2bbf 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -373,8 +373,7 @@ models must be pulled locally before running the tests that need them. **CI (unit + integration tests):** -- `granite4:micro` — default model for `start_session()` and most examples -- `granite4:micro-h` — hybrid variant used by conftest fixtures +- `granite4.1:3b` — default model for `start_session()` and most examples **Examples (`docs/examples/`):** @@ -399,7 +398,7 @@ models must be pulled locally before running the tests that need them. 
Pull everything: ```bash -for m in granite4:micro granite4:micro-h deepseek-r1:8b \ +for m in granite4.1:3b deepseek-r1:8b \ granite3-guardian:2b granite3.2-vision granite3.3:8b granite4:latest \ llama3.2 llama3.2:3b \ qwen2.5vl:7b granite4:small-h llama3.2:1b llama3:8b llava mistral:7b \ diff --git a/cli/alora/README_TEMPLATE.jinja b/cli/alora/README_TEMPLATE.jinja index 9741006d0..d2a01c5bd 100644 --- a/cli/alora/README_TEMPLATE.jinja +++ b/cli/alora/README_TEMPLATE.jinja @@ -85,9 +85,9 @@ def {{ intrinsic_name }}({{ arglist }}, ctx: Context, backend: Backend | Adapter if __name__ == "__main__": from mellea.backends.huggingface import LocalHFBackend - from mellea.backends.model_ids import IBM_GRANITE_4_MICRO_3B + from mellea.backends.model_ids import IBM_GRANITE_4_1_3B from mellea.stdlib.context import ChatContext - backend = LocalHFBackend(IBM_GRANITE_4_MICRO_3B) + backend = LocalHFBackend(IBM_GRANITE_4_1_3B) result, ctx = {{ intrinsic_name }}({{ example_call_kwargs }}, ctx=ChatContext(), backend=backend) print(result.value) ``` diff --git a/cli/eval/runner.py b/cli/eval/runner.py index 58c1ea4df..8bdc6682b 100644 --- a/cli/eval/runner.py +++ b/cli/eval/runner.py @@ -150,7 +150,7 @@ def create_session( else: model_id = model else: - model_id = mellea.model_ids.IBM_GRANITE_4_MICRO_3B + model_id = mellea.model_ids.IBM_GRANITE_4_1_3B try: backend_lower = backend.lower() diff --git a/docs/alora.md b/docs/alora.md index 8a52cd1a8..647de34ee 100644 --- a/docs/alora.md +++ b/docs/alora.md @@ -37,7 +37,7 @@ Use the `m alora train` command to fine-tune a LoRA or aLoRA adapter requirement ```bash m alora train path/to/data.jsonl \ - --basemodel ibm-granite/granite-4.0-micro \ + --basemodel ibm-granite/granite-4.1-3b \ --outfile ./checkpoints/alora_adapter \ --adapter alora \ --device auto \ @@ -48,7 +48,7 @@ m alora train path/to/data.jsonl \ --grad-accum 4 ``` -> **Note on Model Selection**: Only non-hybrid models (e.g., `granite-4.0-micro`) are +> **Note on Model 
Selection**: Only non-hybrid models (e.g., `granite-4.1-3b`) are > currently supported for LoRA or aLoRA training. > Mamba/Transformers hybrid models like `granite-4.0-h-micro` will produce low-quality > results with Mellea's current hard-coded settings for parameter-efficient fine tuning. diff --git a/docs/docs/concepts/context-and-sessions.md b/docs/docs/concepts/context-and-sessions.md index f66559e0a..b4f65895a 100644 --- a/docs/docs/concepts/context-and-sessions.md +++ b/docs/docs/concepts/context-and-sessions.md @@ -129,7 +129,7 @@ from mellea.backends.ollama import OllamaModelBackend from mellea.stdlib.context import SimpleContext backend = OllamaModelBackend( - "granite4:micro", + "granite4.1:3b", model_options={"temperature": 0.2}, ) m = MelleaSession(backend, SimpleContext()) diff --git a/docs/docs/examples/data-extraction-pipeline.md b/docs/docs/examples/data-extraction-pipeline.md index cffbc0db2..89870ee3e 100644 --- a/docs/docs/examples/data-extraction-pipeline.md +++ b/docs/docs/examples/data-extraction-pipeline.md @@ -20,7 +20,7 @@ runtime exactly what shape the result must have. ## Prerequisites - [Quick Start](../getting-started/quickstart) complete -- Ollama running locally with `granite4:micro` pulled +- Ollama running locally with `granite4.1:3b` pulled ## The full example diff --git a/docs/docs/examples/index.md b/docs/docs/examples/index.md index 87c5e0f70..3162a0adf 100644 --- a/docs/docs/examples/index.md +++ b/docs/docs/examples/index.md @@ -128,4 +128,4 @@ uv run docs/examples//.py **Default backend:** `start_session()` with no arguments connects to a local [Ollama](https://ollama.ai) instance running **IBM Granite 4 Micro** -(`granite4:micro`). Make sure Ollama is running before you execute any example. +(`granite4.1:3b`). Make sure Ollama is running before you execute any example. 
diff --git a/docs/docs/examples/legacy-code-integration.md b/docs/docs/examples/legacy-code-integration.md index 1fadc558c..9b4fb4b15 100644 --- a/docs/docs/examples/legacy-code-integration.md +++ b/docs/docs/examples/legacy-code-integration.md @@ -24,7 +24,7 @@ class or instance so you can pass it directly to session methods like `m.act()`, - [Quick Start](../getting-started/quickstart) complete - [MObjects and mify](../concepts/mobjects-and-mify) concept page (recommended background) -- Ollama running locally with `granite4:micro` pulled +- Ollama running locally with `granite4.1:3b` pulled ## The full example diff --git a/docs/docs/examples/resilient-rag-fallback.md b/docs/docs/examples/resilient-rag-fallback.md index d8c28ac45..c942d89d9 100644 --- a/docs/docs/examples/resilient-rag-fallback.md +++ b/docs/docs/examples/resilient-rag-fallback.md @@ -23,7 +23,7 @@ the survivors to a grounded `m.instruct()` call. - [Quick Start](../getting-started/quickstart) complete - `faiss-cpu` and `sentence-transformers` installed, **or** run via `uv run` which installs them automatically from the inline script block -- Ollama running locally with `granite4:micro` pulled (or a Mistral model — see +- Ollama running locally with `granite4.1:3b` pulled (or a Mistral model — see the session setup section below) Install dependencies manually if you are not using `uv run`: diff --git a/docs/docs/examples/traced-generation-loop.md b/docs/docs/examples/traced-generation-loop.md index e54914c02..d2d9c1970 100644 --- a/docs/docs/examples/traced-generation-loop.md +++ b/docs/docs/examples/traced-generation-loop.md @@ -25,7 +25,7 @@ calls. 
## Prerequisites - [Quick Start](../getting-started/quickstart) complete -- Ollama running locally with `granite4:micro` pulled +- Ollama running locally with `granite4.1:3b` pulled - (Optional) [Jaeger](https://www.jaegertracing.io/) running locally for span visualisation — see the Jaeger section below diff --git a/docs/docs/getting-started/installation.md b/docs/docs/getting-started/installation.md index bab02e097..8166c47c4 100644 --- a/docs/docs/getting-started/installation.md +++ b/docs/docs/getting-started/installation.md @@ -57,5 +57,5 @@ The default session connects to [Ollama](https://ollama.ai) running locally. Install Ollama and pull the default model before running any examples: ```bash -ollama pull granite4:micro +ollama pull granite4.1:3b ``` diff --git a/docs/docs/getting-started/quickstart.md b/docs/docs/getting-started/quickstart.md index db591f246..1e9f3dbd7 100644 --- a/docs/docs/getting-started/quickstart.md +++ b/docs/docs/getting-started/quickstart.md @@ -10,7 +10,7 @@ description: "Run your first generative program in minutes." ## Hello world By default, `start_session()` connects to Ollama and uses **IBM Granite 4 Micro** -(`granite4:micro`). Make sure Ollama is running before you run this: +(`granite4.1:3b`). Make sure Ollama is running before you run this: ```python import mellea @@ -191,7 +191,7 @@ HuggingFace, and WatsonX are also supported. See ## Troubleshooting -**`granite4:micro` not found** — run `ollama pull granite4:micro` before starting. +**`granite4.1:3b` not found** — run `ollama pull granite4.1:3b` before starting. **Python 3.13 `outlines` install failure** — `outlines` requires a Rust compiler. Either install [Rust](https://www.rust-lang.org/tools/install) or pin Python to 3.12. 
diff --git a/docs/docs/guide/CONTRIBUTING.md b/docs/docs/guide/CONTRIBUTING.md index cdbb67376..f6e95841c 100644 --- a/docs/docs/guide/CONTRIBUTING.md +++ b/docs/docs/guide/CONTRIBUTING.md @@ -140,7 +140,7 @@ Or a section-level callout if multiple blocks share the caveat: All code — fenced blocks AND inline backtick references — must match current source: - Import paths, class names, method names exact. -- Model IDs current (e.g., `ibm-granite/granite-4.0-micro`). +- Model IDs current (e.g., `ibm-granite/granite-4.1-3b`). - Inline prose fragments consistent with adjacent code blocks. If the source itself has inconsistencies, document as-is and note in the glossary. diff --git a/docs/docs/how-to/backends-and-configuration.md b/docs/docs/how-to/backends-and-configuration.md index 09f798302..8113d3e02 100644 --- a/docs/docs/how-to/backends-and-configuration.md +++ b/docs/docs/how-to/backends-and-configuration.md @@ -13,7 +13,7 @@ configure the backend when you create a session. ## Default backend -`start_session()` defaults to **Ollama** with **IBM Granite 4 Micro** (`granite4:micro`). +`start_session()` defaults to **Ollama** with **IBM Granite 4.1 3B** (`granite4.1:3b`). 
No API keys needed — just have Ollama running: ```python @@ -142,7 +142,7 @@ Run models locally using HuggingFace transformers: from mellea import MelleaSession from mellea.backends.huggingface import LocalHFBackend -backend = LocalHFBackend(model_id="ibm-granite/granite-4.0-micro") +backend = LocalHFBackend(model_id="ibm-granite/granite-4.1-3b") m = MelleaSession(backend=backend) ``` diff --git a/docs/docs/how-to/handling-exceptions.md b/docs/docs/how-to/handling-exceptions.md index 7f18adb08..813d1470a 100644 --- a/docs/docs/how-to/handling-exceptions.md +++ b/docs/docs/how-to/handling-exceptions.md @@ -262,7 +262,7 @@ from mellea.backends import model_ids from mellea.stdlib.sampling import RejectionSamplingStrategy def instruct_with_fallback(text: str) -> str: - m_fast = MelleaSession(OllamaModelBackend(model_ids.IBM_GRANITE_4_MICRO_3B)) + m_fast = MelleaSession(OllamaModelBackend(model_ids.IBM_GRANITE_4_1_3B)) result = m_fast.instruct( text, strategy=RejectionSamplingStrategy(loop_budget=3), diff --git a/docs/docs/how-to/m-decompose.md b/docs/docs/how-to/m-decompose.md index 056bdbb40..3074145ae 100644 --- a/docs/docs/how-to/m-decompose.md +++ b/docs/docs/how-to/m-decompose.md @@ -25,7 +25,7 @@ m decompose run --input-file task.txt --out-dir ./output/ > **Note:** The output directory must already exist — the command will error if it > does not. On first run with Ollama, the default model will be downloaded > automatically (~15 GB for the full model). Use `--model-id` with a smaller model -> (e.g. `granite4:micro`) to avoid the large download. +> (e.g. `granite4.1:3b`) to avoid the large download. This produces a subdirectory under `./output/` (one per task job): @@ -59,7 +59,7 @@ python output/m_decomp_result/m_decomp_result.py ## Backend options -`m decompose` defaults to Ollama with `granite4:micro`. Pass `--backend` and +`m decompose` defaults to Ollama with `granite4.1:3b`. 
Pass `--backend` and `--model-id` to use a different inference engine: ```bash @@ -86,7 +86,7 @@ from cli.decompose.pipeline import DecompBackend, decompose result = decompose( task_prompt="Write a short blog post about morning exercise.", - model_id="granite4:micro", + model_id="granite4.1:3b", backend=DecompBackend.ollama, ) diff --git a/docs/docs/how-to/unit-test-generative-code.md b/docs/docs/how-to/unit-test-generative-code.md index 3e2ead051..12769949a 100644 --- a/docs/docs/how-to/unit-test-generative-code.md +++ b/docs/docs/how-to/unit-test-generative-code.md @@ -45,7 +45,7 @@ import pytest from mellea import MelleaSession from mellea.backends.ollama import OllamaModelBackend -_MODEL_ID = "granite4:micro" +_MODEL_ID = "granite4.1:3b" @pytest.fixture(scope="module") @@ -358,8 +358,8 @@ from mellea.stdlib.components.unit_test_eval import TestBasedEval test_evals = TestBasedEval.from_json_file("tests/eval_data/email_writer.json") -judge_session = start_session(backend_name="ollama", model_id="granite4:micro") -generation_session = start_session(backend_name="ollama", model_id="granite4:micro") +judge_session = start_session(backend_name="ollama", model_id="granite4.1:3b") +generation_session = start_session(backend_name="ollama", model_id="granite4.1:3b") for eval_case in test_evals: for idx, input_text in enumerate(eval_case.inputs): @@ -380,7 +380,7 @@ for eval_case in test_evals: > **Note:** `TestBasedEval` calls the judge model once per input. For large > evaluation sets, consider batching or running evaluations asynchronously. > **CLI alternative:** The same evaluation can be run without writing Python: -> `m eval run tests/eval_data/email_writer.json --backend ollama --model granite4:micro` +> `m eval run tests/eval_data/email_writer.json --backend ollama --model granite4.1:3b` > See `m eval run --help` for full options. 
## CI strategy diff --git a/docs/docs/how-to/use-images-and-vision.md b/docs/docs/how-to/use-images-and-vision.md index f63e96992..00697bad2 100644 --- a/docs/docs/how-to/use-images-and-vision.md +++ b/docs/docs/how-to/use-images-and-vision.md @@ -10,7 +10,7 @@ Mellea supports multimodal input: pass images alongside your text prompt to any **Prerequisites:** `pip install mellea pillow`, a vision-capable model downloaded and running. -> **Backend note:** The default Ollama model (`granite4:micro`) does not support image +> **Backend note:** The default Ollama model (`granite4.1:3b`) does not support image > input. You must switch to a vision-capable model such as `granite3.2-vision` or > `llava`. Not all backends support vision — see backend notes below. diff --git a/docs/docs/integrations/langchain.md b/docs/docs/integrations/langchain.md index a15ea4542..d1617499c 100644 --- a/docs/docs/integrations/langchain.md +++ b/docs/docs/integrations/langchain.md @@ -53,7 +53,7 @@ instance, so any tool that follows the LangChain `BaseTool` interface works with further configuration. > **Backend note:** Tool calling requires a backend and model that support function -> calling (e.g., Ollama with `granite4:micro`, OpenAI with `gpt-4o`). The default +> calling (e.g., Ollama with `granite4.1:3b`, OpenAI with `gpt-4o`). The default > Ollama setup supports this. ## Seeding a session with LangChain message history diff --git a/docs/docs/integrations/ollama.md b/docs/docs/integrations/ollama.md index 8f8dd8a9f..af34af099 100644 --- a/docs/docs/integrations/ollama.md +++ b/docs/docs/integrations/ollama.md @@ -34,7 +34,7 @@ background service. ## Default setup `start_session()` connects to Ollama on `localhost:11434` and uses -**IBM Granite 4 Micro** (`granite4:micro`) by default. On first run, Mellea +**IBM Granite 4.1 3B** (`granite4.1:3b`) by default. 
On first run, Mellea automatically pulls the model if it is not already downloaded: ```python @@ -47,7 +47,7 @@ print(str(email)) # Output will vary — LLM responses depend on model and temperature. ``` -> **Note:** The first run pulls `granite4:micro` (~2 GB). Subsequent runs start +> **Note:** The first run pulls `granite4.1:3b` (~2 GB). Subsequent runs start > immediately from the local cache. ## Switching models @@ -75,7 +75,7 @@ m = start_session(model_id=model_ids.IBM_GRANITE_3_3_8B) Pull models before using them (or let Mellea pull on first use): ```bash -ollama pull granite4:micro +ollama pull granite4.1:3b ollama pull llama3.2:3b ollama pull mistral:7b ``` @@ -84,8 +84,8 @@ ollama pull mistral:7b | `model_ids` constant | Ollama name | Notes | | -------------------- | ----------- | ----- | -| `IBM_GRANITE_4_MICRO_3B` | `granite4:micro` | Default. Fast, low memory (~2 GB). | -| `IBM_GRANITE_4_HYBRID_MICRO` | `granite4:micro-h` | Hybrid variant with extended thinking. | +| `IBM_GRANITE_4_1_3B` | `granite4.1:3b` | Default. Fast, low memory (~2 GB). | +| `IBM_GRANITE_4_1_8B` | `granite4.1:8b` | Higher quality, ~5 GB. | | `IBM_GRANITE_3_3_8B` | `granite3.3:8b` | Higher quality, ~5 GB. | | `IBM_GRANITE_3_3_VISION_2B` | `ibm/granite3.3-vision:2b` | Vision model for image inputs. | | `META_LLAMA_3_2_3B` | `llama3.2:3b` | Compact Llama model. | @@ -131,7 +131,7 @@ from mellea.backends.ollama import OllamaModelBackend m = MelleaSession( OllamaModelBackend( - model_id="granite4:micro", + model_id="granite4.1:3b", base_url="http://my-gpu-server:11434", ) ) @@ -152,7 +152,7 @@ from mellea.backends.ollama import OllamaModelBackend m = MelleaSession( OllamaModelBackend( - model_id=model_ids.IBM_GRANITE_4_MICRO_3B, + model_id=model_ids.IBM_GRANITE_4_1_3B, model_options={ ModelOption.TEMPERATURE: 0.1, ModelOption.SEED: 42, @@ -193,7 +193,7 @@ print(str(response)) ``` > **Backend note:** Vision requires a model that supports image inputs. 
The default -> `granite4:micro` is text-only. Pull a vision model explicitly before using images: +> `granite4.1:3b` is text-only. Pull a vision model explicitly before using images: > `ollama pull ibm/granite3.3-vision:2b`. ## Ollama's OpenAI-compatible endpoint @@ -236,7 +236,7 @@ let Mellea pull it automatically on first use. Ollama loads the model into memory on the first request. Subsequent requests in the same session are much faster. On machines with less than 8 GB RAM, consider using -`granite4:micro` or `llama3.2:1b`. +`granite4.1:3b` or `llama3.2:1b`. ### Intel Mac torch errors diff --git a/docs/docs/integrations/smolagents.md b/docs/docs/integrations/smolagents.md index 992ab80f7..54e5f75a7 100644 --- a/docs/docs/integrations/smolagents.md +++ b/docs/docs/integrations/smolagents.md @@ -46,7 +46,7 @@ if result.tool_calls: description and parameter types are preserved exactly. > **Backend note:** Tool calling requires a backend and model that support function -> calling (e.g., Ollama with `granite4:micro`, OpenAI with `gpt-4o`). The default +> calling (e.g., Ollama with `granite4.1:3b`, OpenAI with `gpt-4o`). The default > Ollama setup supports this. 
> > **Full example:** [`docs/examples/tools/smolagents_example.py`](https://github.com/generative-computing/mellea/blob/main/docs/examples/tools/smolagents_example.py) diff --git a/docs/docs/observability/logging.md b/docs/docs/observability/logging.md index 903b2e56a..c565b406d 100644 --- a/docs/docs/observability/logging.md +++ b/docs/docs/observability/logging.md @@ -90,7 +90,7 @@ With structured JSON output enabled, the same `SUCCESS` record looks like: "thread_id": 6179762176, "session_id": "550e8400-e29b-41d4-a716-446655440000", "backend": "OllamaModelBackend", - "model_id": "granite4:micro", + "model_id": "granite4.1:3b", "strategy": "RejectionSamplingStrategy", "loop_budget": 3 } diff --git a/docs/docs/observability/metrics.md b/docs/docs/observability/metrics.md index 43ef1a256..237fd9894 100644 --- a/docs/docs/observability/metrics.md +++ b/docs/docs/observability/metrics.md @@ -459,7 +459,7 @@ from mellea.telemetry import create_counter, create_histogram, create_up_down_co # Monotonically increasing values requests = create_counter("myapp.requests", unit="1", description="Total requests") -requests.add(1, {"backend": "ollama", "model": "granite4:micro"}) +requests.add(1, {"backend": "ollama", "model": "granite4.1:3b"}) # Value distributions latency = create_histogram("myapp.latency", unit="ms", description="Request latency") diff --git a/docs/docs/observability/tracing.md b/docs/docs/observability/tracing.md index 617805fbe..96fba56c8 100644 --- a/docs/docs/observability/tracing.md +++ b/docs/docs/observability/tracing.md @@ -158,7 +158,7 @@ session_context (mellea.application) │ │ [mellea.backend=OllamaModelBackend] │ ├── chat (mellea.backend) │ │ [gen_ai.system=ollama] -│ │ [gen_ai.request.model=granite4:micro] +│ │ [gen_ai.request.model=granite4.1:3b] │ │ [gen_ai.usage.input_tokens=150] │ │ [gen_ai.usage.output_tokens=42] │ └── requirement_validation (mellea.application) diff --git a/docs/docs/troubleshooting/common-errors.md 
b/docs/docs/troubleshooting/common-errors.md index d0b1adbf4..deb025fbf 100644 --- a/docs/docs/troubleshooting/common-errors.md +++ b/docs/docs/troubleshooting/common-errors.md @@ -6,16 +6,16 @@ description: "Common errors, diagnostic steps, and fixes for Mellea programs." ## Installation -### `granite4:micro` not found +### `granite4.1:3b` not found ```text -Error: model "granite4:micro" not found +Error: model "granite4.1:3b" not found ``` Pull the model before running: ```bash -ollama pull granite4:micro +ollama pull granite4.1:3b ``` ### Python 3.13: `outlines` install failure diff --git a/docs/docs/troubleshooting/faq.md b/docs/docs/troubleshooting/faq.md index edf2a1171..fd7751536 100644 --- a/docs/docs/troubleshooting/faq.md +++ b/docs/docs/troubleshooting/faq.md @@ -38,7 +38,7 @@ m = MelleaSession( ) ``` -## How do I use a model other than `granite4:micro`? +## How do I use a model other than `granite4.1:3b`? Pass the `model_id` parameter to `start_session()`: diff --git a/docs/docs/tutorials/01-your-first-generative-program.md b/docs/docs/tutorials/01-your-first-generative-program.md index ea7b5beba..d92fc9ca2 100644 --- a/docs/docs/tutorials/01-your-first-generative-program.md +++ b/docs/docs/tutorials/01-your-first-generative-program.md @@ -20,7 +20,7 @@ By the end you will have covered: > see [Tutorial 03: Using Generative Stubs](../tutorials/03-using-generative-stubs). **Prerequisites:** [Quick Start](../getting-started/quickstart) complete, -Mellea installed (`uv add mellea`), Ollama running locally with `granite4:micro` downloaded. +Mellea installed (`uv add mellea`), Ollama running locally with `granite4.1:3b` downloaded. 
--- diff --git a/docs/docs/tutorials/02-streaming-and-async.md b/docs/docs/tutorials/02-streaming-and-async.md index ba3ad0c6b..9a210660f 100644 --- a/docs/docs/tutorials/02-streaming-and-async.md +++ b/docs/docs/tutorials/02-streaming-and-async.md @@ -16,7 +16,7 @@ By the end you will have covered: - Context behaviour with concurrent async calls **Prerequisites:** [Tutorial 01](./01-your-first-generative-program) complete, -`pip install mellea`, Ollama running locally with `granite4:micro` downloaded. +`pip install mellea`, Ollama running locally with `granite4.1:3b` downloaded. --- diff --git a/docs/docs/tutorials/03-using-generative-stubs.md b/docs/docs/tutorials/03-using-generative-stubs.md index 737cd0ccb..d2f33a2fa 100644 --- a/docs/docs/tutorials/03-using-generative-stubs.md +++ b/docs/docs/tutorials/03-using-generative-stubs.md @@ -16,7 +16,7 @@ By the end you will have covered: - Precondition and postcondition validation patterns **Prerequisites:** [Tutorial 01](./01-your-first-generative-program) complete, -`pip install mellea`, Ollama running locally with `granite4:micro` downloaded. +`pip install mellea`, Ollama running locally with `granite4.1:3b` downloaded. --- diff --git a/docs/docs/tutorials/04-making-agents-reliable.md b/docs/docs/tutorials/04-making-agents-reliable.md index 53296f032..81a21d1a4 100644 --- a/docs/docs/tutorials/04-making-agents-reliable.md +++ b/docs/docs/tutorials/04-making-agents-reliable.md @@ -18,7 +18,7 @@ By the end you will have covered: **Prerequisites:** [Tutorial 02](./02-streaming-and-async) and [Tutorial 03](./03-using-generative-stubs) complete, -`pip install mellea`, Ollama running locally with `granite4:micro` downloaded. +`pip install mellea`, Ollama running locally with `granite4.1:3b` downloaded. 
--- diff --git a/docs/docs/tutorials/05-mifying-legacy-code.md b/docs/docs/tutorials/05-mifying-legacy-code.md index 4e71db5bb..75891c7b3 100644 --- a/docs/docs/tutorials/05-mifying-legacy-code.md +++ b/docs/docs/tutorials/05-mifying-legacy-code.md @@ -18,7 +18,7 @@ By the end you will have covered: - Using `stringify_func` for custom text representations **Prerequisites:** [Tutorial 01](./01-your-first-generative-program) complete, -`pip install mellea`, Ollama running locally with `granite4:micro` downloaded. +`pip install mellea`, Ollama running locally with `granite4.1:3b` downloaded. --- diff --git a/docs/examples/aLora/README.md b/docs/examples/aLora/README.md index e066879dd..44ebe07af 100644 --- a/docs/examples/aLora/README.md +++ b/docs/examples/aLora/README.md @@ -19,7 +19,7 @@ Now let's train a model: ``` m alora train \ - --basemodel ibm-granite/granite-4.0-micro \ + --basemodel ibm-granite/granite-4.1-3b \ --outfile stembolts_model \ --adapter alora \ stembolt_failure_dataset.jsonl @@ -69,7 +69,7 @@ After uploading your adapter, you can auto-generate a README for the HuggingFace m alora add-readme \ --name $HF_USERNAME/stembolts \ --io-yaml io.yaml \ - --basemodel granite-4.0-micro \ + --basemodel granite-4.1-3b \ stembolt_failure_dataset.jsonl ``` @@ -79,7 +79,7 @@ You can provide a `--hints` file with additional domain context to improve the g m alora add-readme \ --name $HF_USERNAME/stembolts \ --io-yaml io.yaml \ - --basemodel granite-4.0-micro \ + --basemodel granite-4.1-3b \ --hints hints.txt stembolt_failure_dataset.jsonl ``` @@ -94,7 +94,7 @@ You can now create a new adapter class for this model somewhere in your python p from mellea.backends.adapters.adapter import CustomIntrinsicAdapter class StemboltAdapter(CustomIntrinsicAdapter): - def __init__(self, base_model_name:str="granite-4.0-micro"): + def __init__(self, base_model_name:str="granite-4.1-3b"): super().__init__( model_id="$USERNAME/stembolts", # REPLACE $USERNAME WITH YOUR 
HUGGINGFACE USERNAME intrinsic_name="stembolts", @@ -108,10 +108,10 @@ Using this adapter requires adding it to a backend: from mellea.backends.huggingface import LocalHFBackend backend = LocalHFBackend( - model_id="ibm-granite/granite-4.0-micro", cache=SimpleLRUCache(5) + model_id="ibm-granite/granite-4.1-3b", cache=SimpleLRUCache(5) ) -backend.add_adapter(StemboltAdapter(base_model_name="granite-4.0-micro")) +backend.add_adapter(StemboltAdapter(base_model_name="granite-4.1-3b")) ``` A full example of how to use this adapter as a requirement is found in `101_example.py` diff --git a/docs/examples/instruct_validate_repair/101_email.py b/docs/examples/instruct_validate_repair/101_email.py index ee5eee664..37b0dedce 100644 --- a/docs/examples/instruct_validate_repair/101_email.py +++ b/docs/examples/instruct_validate_repair/101_email.py @@ -23,7 +23,7 @@ # from mellea.stdlib.base import SimpleContext # m = MelleaSession( # backend=OllamaModelBackend( -# model_id=model_ids.IBM_GRANITE_4_MICRO_3B, +# model_id=model_ids.IBM_GRANITE_4_1_3B, # model_options={ModelOption.MAX_NEW_TOKENS: 200}, # ), # ctx=SimpleContext() diff --git a/docs/examples/m_serve/README.md b/docs/examples/m_serve/README.md index 70fcb5f5e..9abbaf7d7 100644 --- a/docs/examples/m_serve/README.md +++ b/docs/examples/m_serve/README.md @@ -112,7 +112,7 @@ client = openai.OpenAI(api_key="na", base_url="http://0.0.0.0:8080/v1") # Enable streaming with stream=True stream = client.chat.completions.create( messages=[{"role": "user", "content": "Tell me a story"}], - model="granite4:micro-h", + model="granite4.1:3b", stream=True, ) diff --git a/docs/examples/m_serve/client.py b/docs/examples/m_serve/client.py index 17652b4e3..f75170af4 100644 --- a/docs/examples/m_serve/client.py +++ b/docs/examples/m_serve/client.py @@ -7,7 +7,7 @@ response = client.chat.completions.create( messages=[{"role": "user", "content": "Find all the real roots of x^3 + 1."}], - model="granite4:micro-h", + model="granite4.1:3b", ) 
print(response.choices[0]) diff --git a/docs/examples/m_serve/client_streaming.py b/docs/examples/m_serve/client_streaming.py index 3606148ef..8153da1d6 100644 --- a/docs/examples/m_serve/client_streaming.py +++ b/docs/examples/m_serve/client_streaming.py @@ -28,7 +28,7 @@ messages=[ {"role": "user", "content": "Count down from 100 using words not digits."} ], - model="granite4:micro-h", + model="granite4.1:3b", stream=True, ) for chunk in stream_result: @@ -40,7 +40,7 @@ messages=[ {"role": "user", "content": "Count down from 100 using words not digits."} ], - model="granite4:micro-h", + model="granite4.1:3b", stream=False, ) print(completion_result.choices[0].message.content) diff --git a/docs/examples/m_serve/pii_serve.py b/docs/examples/m_serve/pii_serve.py index 7352617bf..effedf48e 100644 --- a/docs/examples/m_serve/pii_serve.py +++ b/docs/examples/m_serve/pii_serve.py @@ -5,7 +5,7 @@ import mellea from cli.serve.models import ChatMessage -from mellea.backends.model_ids import IBM_GRANITE_4_MICRO_3B +from mellea.backends.model_ids import IBM_GRANITE_4_1_3B from mellea.core import ModelOutputThunk, SamplingResult from mellea.stdlib.requirements import req, simple_validate from mellea.stdlib.sampling import RejectionSamplingStrategy @@ -71,7 +71,7 @@ def pii_remove_validate( return "The Validation Failed" -session = mellea.start_session(model_id=IBM_GRANITE_4_MICRO_3B) +session = mellea.start_session(model_id=IBM_GRANITE_4_1_3B) def serve( diff --git a/docs/examples/mini_researcher/README.md b/docs/examples/mini_researcher/README.md index 610ea6ce8..ae63c707c 100644 --- a/docs/examples/mini_researcher/README.md +++ b/docs/examples/mini_researcher/README.md @@ -54,7 +54,7 @@ Result @cache def get_session(): """Get M session (change model here).""" - return MelleaSession(backend=OllamaModelBackend(model_ids.IBM_GRANITE_4_MICRO_3B)) + return MelleaSession(backend=OllamaModelBackend(model_ids.IBM_GRANITE_4_1_3B)) @cache def get_guardian_session(): diff --git 
a/docs/examples/mini_researcher/researcher.py b/docs/examples/mini_researcher/researcher.py index 708f3ba13..fd712de69 100644 --- a/docs/examples/mini_researcher/researcher.py +++ b/docs/examples/mini_researcher/researcher.py @@ -24,7 +24,7 @@ @cache def get_session(): """Get M session (change model here).""" - return MelleaSession(backend=OllamaModelBackend(model_ids.IBM_GRANITE_4_MICRO_3B)) + return MelleaSession(backend=OllamaModelBackend(model_ids.IBM_GRANITE_4_1_3B)) @cache diff --git a/docs/examples/notebooks/georgia_tech.ipynb b/docs/examples/notebooks/georgia_tech.ipynb index 95db80d93..bff0831bd 100644 --- a/docs/examples/notebooks/georgia_tech.ipynb +++ b/docs/examples/notebooks/georgia_tech.ipynb @@ -21,7 +21,7 @@ "\n", "Run the first cell during our introduction. The first cell will:\n", " * download an install ollama on your Colab instance\n", - " * download the `ibm/granite4:micro` model weights\n" + " * download the `granite4.1:3b` model weights\n" ] }, { @@ -40,7 +40,7 @@ "!nohup ollama serve >/dev/null 2>&1 &\n", "\n", "# Download the granite:3.3:8b weights.\n", - "!ollama pull ibm/granite4:micro\n", + "!ollama pull granite4.1:3b\n", "!ollama pull llama3.2:3b\n", "\n", "# install Mellea.\n", diff --git a/docs/examples/sofai/README.md b/docs/examples/sofai/README.md index 7bf33c34d..92da06806 100644 --- a/docs/examples/sofai/README.md +++ b/docs/examples/sofai/README.md @@ -51,7 +51,7 @@ from mellea.stdlib.sampling import SOFAISamplingStrategy from mellea.stdlib.requirements import req # Create fast and slow backends -s1_backend = OllamaModelBackend(model_id="granite4:micro") +s1_backend = OllamaModelBackend(model_id="granite4.1:3b") s2_backend = OllamaModelBackend(model_id="granite4:latest") # Create SOFAI strategy @@ -100,7 +100,7 @@ SOFAISamplingStrategy( ### Fast Models (S1) -- granite4:micro +- granite4.1:3b - llama3.2:3b - mistral:7b diff --git a/docs/examples/sofai/sofai_graph_coloring.py b/docs/examples/sofai/sofai_graph_coloring.py index 
06a46189b..5821782e1 100644 --- a/docs/examples/sofai/sofai_graph_coloring.py +++ b/docs/examples/sofai/sofai_graph_coloring.py @@ -7,7 +7,7 @@ In this example, we use the SOFAI sampling strategy. Because we wrote this example to run on consumer grade hardware, each model is still relatively small: -1. S1 Solver (granite4:micro) - Fast model with iterative feedback loop +1. S1 Solver (granite4.1:3b) - Fast model with iterative feedback loop 2. S2 Solver (granite4:latest) - Slow model, called once on escalation 3. Custom validator - Provides detailed feedback for constraint violations @@ -142,7 +142,7 @@ def check_graph_coloring(ctx) -> ValidationResult: def main(): """Run the graph coloring example with SOFAI strategy.""" # Initialize backends - s1_solver_backend = OllamaModelBackend(model_id="granite4:micro") + s1_solver_backend = OllamaModelBackend(model_id="granite4.1:3b") s2_solver_backend = OllamaModelBackend(model_id="granite4:latest") # Optional: Initialize judge backend for LLM-as-Judge validation @@ -195,7 +195,7 @@ def main(): # Determine which solver was used if i < solver_1_attempts: - solver_name = "S1 Solver (granite4:micro)" + solver_name = "S1 Solver (granite4.1:3b)" else: solver_name = "S2 Solver (granite4:latest)" diff --git a/mellea/backends/litellm.py b/mellea/backends/litellm.py index e49eee583..8bb49eebe 100644 --- a/mellea/backends/litellm.py +++ b/mellea/backends/litellm.py @@ -79,8 +79,7 @@ class LiteLLMBackend(FormatterBackend): def __init__( self, - model_id: str = "ollama_chat/" - + str(model_ids.IBM_GRANITE_4_MICRO_3B.ollama_name), + model_id: str = "ollama_chat/" + str(model_ids.IBM_GRANITE_4_1_3B.ollama_name), formatter: ChatFormatter | None = None, base_url: str | None = "http://localhost:11434", model_options: dict | None = None, diff --git a/mellea/backends/model_ids.py b/mellea/backends/model_ids.py index 09ce73acb..7fb52de47 100644 --- a/mellea/backends/model_ids.py +++ b/mellea/backends/model_ids.py @@ -74,6 +74,25 @@ class 
ModelIdentifier: watsonx_name=None, ) +# Granite 4.1 Dense Models +IBM_GRANITE_4_1_3B = ModelIdentifier( + hf_model_name="ibm-granite/granite-4.1-3b", + ollama_name="granite4.1:3b", + watsonx_name=None, +) + +IBM_GRANITE_4_1_8B = ModelIdentifier( + hf_model_name="ibm-granite/granite-4.1-8b", ollama_name="granite4.1:8b" +) + +IBM_GRANITE_4_1_30B = ModelIdentifier( + hf_model_name="ibm-granite/granite-4.1-30b", ollama_name="granite4.1:30b" +) + +IBM_GRANITE_GUARDIAN_4_1_8B = ModelIdentifier( + hf_model_name="ibm-granite/granite-guardian-4.1-8b" +) + # Deprecated Granite 3 models - kept for backward compatibility # These maintain their original model references (not upgraded to Granite 4) diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py index 3ff7ff1db..30482832a 100644 --- a/mellea/backends/ollama.py +++ b/mellea/backends/ollama.py @@ -62,7 +62,7 @@ class OllamaModelBackend(FormatterBackend): def __init__( self, - model_id: str | ModelIdentifier = model_ids.IBM_GRANITE_4_MICRO_3B, + model_id: str | ModelIdentifier = model_ids.IBM_GRANITE_4_1_3B, formatter: ChatFormatter | None = None, base_url: str | None = None, model_options: dict | None = None, diff --git a/mellea/plugins/hooks/session.py b/mellea/plugins/hooks/session.py index 321f154ee..024692228 100644 --- a/mellea/plugins/hooks/session.py +++ b/mellea/plugins/hooks/session.py @@ -31,7 +31,7 @@ class SessionPostInitPayload(MelleaBasePayload): Attributes: session_id: UUID string identifying this session. - model_id: Model identifier used by the backend (e.g. ``"granite4:micro"``). + model_id: Model identifier used by the backend (e.g. ``"granite4.1:3b"``). context: The initial ``Context`` instance for this session. 
""" diff --git a/mellea/stdlib/session.py b/mellea/stdlib/session.py index 0925567f0..5840706c1 100644 --- a/mellea/stdlib/session.py +++ b/mellea/stdlib/session.py @@ -18,7 +18,11 @@ from PIL import Image as PILImage -from ..backends.model_ids import IBM_GRANITE_4_MICRO_3B, ModelIdentifier +from ..backends.model_ids import ( + IBM_GRANITE_4_1_3B, + IBM_GRANITE_4_HYBRID_SMALL, + ModelIdentifier, +) from ..core import ( Backend, BaseModelSubclass, @@ -77,7 +81,7 @@ def get_session() -> MelleaSession: def start_session( backend_name: Literal["ollama", "hf", "openai", "watsonx", "litellm"] = "ollama", - model_id: str | ModelIdentifier = IBM_GRANITE_4_MICRO_3B, + model_id: str | ModelIdentifier = IBM_GRANITE_4_1_3B, ctx: Context | None = None, *, context_type: Literal["simple", "chat"] | None = None, @@ -98,7 +102,7 @@ def start_session( - "ollama": Use Ollama backend for local models - "hf" or "huggingface": Use HuggingFace transformers backend - "openai": Use OpenAI API backend - - "watsonx": Use IBM WatsonX backend + - "watsonx": Use IBM WatsonX backend, WARNING: this defaults to the IBM_GRANITE_4_HYBRID_SMALL model for now. - "litellm": Use the LiteLLM backend model_id: Model identifier or name. Can be a `ModelIdentifier` from mellea.backends.model_ids or a string model name. @@ -181,8 +185,23 @@ def start_session( model_id_str = pre_payload.model_id model_options = pre_payload.model_options - # Construct backend post-hook. - backend = backend_class(model_id, model_options=model_options, **backend_kwargs) + backend_class = backend_name_to_class(backend_name) + if backend_class is None: + raise Exception( + f"Backend name {backend_name} unknown. Please see the docstring for `mellea.stdlib.session.start_session` for a list of options." 
+ ) + assert backend_class is not None + if "watsonx" in backend_name: + # Temp hack for watsonx for granite 4.1 + backend = backend_class( + IBM_GRANITE_4_HYBRID_SMALL.watsonx_name, + model_options=model_options, + **backend_kwargs, + ) + else: + backend = backend_class( + model_id, model_options=model_options, **backend_kwargs + ) logger.info( f"Starting Mellea session: backend={backend_name}, model={model_id_str}, " diff --git a/test/backends/test_litellm_ollama.py b/test/backends/test_litellm_ollama.py index 7b76196d4..eaaf69f2b 100644 --- a/test/backends/test_litellm_ollama.py +++ b/test/backends/test_litellm_ollama.py @@ -15,7 +15,7 @@ from mellea.stdlib.context import SimpleContext from mellea.stdlib.sampling import RejectionSamplingStrategy -_MODEL_ID = f"ollama_chat/{model_ids.IBM_GRANITE_4_HYBRID_MICRO.ollama_name}" +_MODEL_ID = f"ollama_chat/{model_ids.IBM_GRANITE_4_1_3B.ollama_name}" @pytest.fixture(scope="function") diff --git a/test/backends/test_openai_ollama.py b/test/backends/test_openai_ollama.py index 9a8ac5de0..cfecc83f4 100644 --- a/test/backends/test_openai_ollama.py +++ b/test/backends/test_openai_ollama.py @@ -14,7 +14,7 @@ from mellea import MelleaSession from mellea.backends import ModelOption -from mellea.backends.model_ids import IBM_GRANITE_4_HYBRID_MICRO +from mellea.backends.model_ids import IBM_GRANITE_4_1_3B from mellea.backends.openai import OpenAIBackend from mellea.core import CBlock, ModelOutputThunk from mellea.formatters import TemplateFormatter @@ -25,8 +25,8 @@ def backend(gh_run: int): """Shared OpenAI backend configured for Ollama.""" return OpenAIBackend( - model_id=IBM_GRANITE_4_HYBRID_MICRO.ollama_name, # type: ignore - formatter=TemplateFormatter(model_id=IBM_GRANITE_4_HYBRID_MICRO.hf_model_name), # type: ignore + model_id=IBM_GRANITE_4_1_3B.ollama_name, # type: ignore + formatter=TemplateFormatter(model_id=IBM_GRANITE_4_1_3B.hf_model_name), # type: ignore base_url=f"http://{os.environ.get('OLLAMA_HOST', 
'localhost:11434')}/v1", api_key="ollama", ) diff --git a/test/backends/test_openai_vllm.py b/test/backends/test_openai_vllm.py index 36512f496..1d2e5813e 100644 --- a/test/backends/test_openai_vllm.py +++ b/test/backends/test_openai_vllm.py @@ -28,7 +28,7 @@ import mellea.backends.model_ids as model_ids from mellea import MelleaSession from mellea.backends import ModelOption -from mellea.backends.model_ids import IBM_GRANITE_4_MICRO_3B +from mellea.backends.model_ids import IBM_GRANITE_4_1_3B from mellea.backends.openai import OpenAIBackend from mellea.core import CBlock, ModelOutputThunk from mellea.formatters import TemplateFormatter @@ -72,9 +72,9 @@ def vllm_process(): "-m", "vllm.entrypoints.openai.api_server", "--model", - IBM_GRANITE_4_MICRO_3B.hf_model_name, + IBM_GRANITE_4_1_3B.hf_model_name, "--served-model-name", - IBM_GRANITE_4_MICRO_3B.hf_model_name, + IBM_GRANITE_4_1_3B.hf_model_name, "--enable-lora", "--dtype", "bfloat16", @@ -166,8 +166,8 @@ def backend(gh_run: int, vllm_process: subprocess.Popen): """Shared OpenAI backend configured for vLLM.""" base_url = os.environ.get("VLLM_TEST_BASE_URL", "http://127.0.0.1:8000") + "/v1" return OpenAIBackend( - model_id=IBM_GRANITE_4_MICRO_3B.hf_model_name, # type: ignore - formatter=TemplateFormatter(model_id=IBM_GRANITE_4_MICRO_3B.hf_model_name), # type: ignore + model_id=IBM_GRANITE_4_1_3B.hf_model_name, # type: ignore + formatter=TemplateFormatter(model_id=IBM_GRANITE_4_1_3B.hf_model_name), # type: ignore base_url=base_url, api_key="EMPTY", ) diff --git a/test/backends/test_tool_calls.py b/test/backends/test_tool_calls.py index f67f2cdde..896d5ee9b 100644 --- a/test/backends/test_tool_calls.py +++ b/test/backends/test_tool_calls.py @@ -61,6 +61,9 @@ def test2(): ... assert "to_markdown" in tools +@pytest.mark.xfail( + reason="We don't force tools to be called. As a result, this test might unexpectedly fail." +) def test_tool_called(m: MelleaSession, table: Table): """We don't force tools to be called. 
As a result, this test might unexpectedly fail.""" r = 10 diff --git a/test/backends/test_vision_openai.py b/test/backends/test_vision_openai.py index 609346b44..2e20d61cf 100644 --- a/test/backends/test_vision_openai.py +++ b/test/backends/test_vision_openai.py @@ -11,7 +11,7 @@ from mellea import MelleaSession, start_session from mellea.backends import ModelOption -from mellea.backends.model_ids import IBM_GRANITE_4_HYBRID_MICRO +from mellea.backends.model_ids import IBM_GRANITE_4_1_3B from mellea.core import ImageBlock, ModelOutputThunk from mellea.stdlib.components import Instruction, Message @@ -21,7 +21,7 @@ def m_session(gh_run): if gh_run == 1: m = start_session( "openai", - model_id=IBM_GRANITE_4_HYBRID_MICRO.ollama_name, # type: ignore + model_id=IBM_GRANITE_4_1_3B.ollama_name, # type: ignore base_url=f"http://{os.environ.get('OLLAMA_HOST', 'localhost:11434')}/v1", api_key="ollama", model_options={ModelOption.MAX_NEW_TOKENS: 5}, diff --git a/test/cli/test_alora_train_integration.py b/test/cli/test_alora_train_integration.py index baa5b4857..b2666cb8c 100644 --- a/test/cli/test_alora_train_integration.py +++ b/test/cli/test_alora_train_integration.py @@ -45,7 +45,7 @@ def test_alora_training_integration(): 3. Verifies adapter files are created with correct PEFT 0.18+ format 4. Cleans up temporary files - Uses ibm-granite/granite-4.0-micro (smallest Granite model, 3B params). + Uses ibm-granite/granite-4.1-3b (smallest Granite model, 3B params). 
""" from cli.alora.train import train_model @@ -82,10 +82,10 @@ def test_alora_training_integration(): adapter_path = tmpdir_path / "test_alora_adapter" # Train aLoRA adapter with minimal settings - # Using smallest Granite model: granite-4.0-micro (3B params) + # Using smallest Granite model: granite-4.1-3b (3B params) train_model( dataset_path=str(dataset_path), - base_model="ibm-granite/granite-4.0-micro", + base_model="ibm-granite/granite-4.1-3b", output_file=str(adapter_path), adapter="alora", epochs=1, # Just 1 epoch for speed @@ -186,7 +186,7 @@ def test_alora_training_integration(): # Additional verification: Verify invocation tokens are correct # The default invocation prompt is "<|start_of_role|>check_requirement<|end_of_role|>" - tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-4.0-micro") + tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granite-4.1-3b") default_invocation_prompt = "<|start_of_role|>check_requirement<|end_of_role|>" expected_tokens = tokenizer.encode( default_invocation_prompt, add_special_tokens=False @@ -204,9 +204,7 @@ def test_alora_training_integration(): from transformers import AutoModelForCausalLM base_model = AutoModelForCausalLM.from_pretrained( - "ibm-granite/granite-4.0-micro", - device_map="auto", - torch_dtype=torch.bfloat16, + "ibm-granite/granite-4.1-3b", device_map="auto", torch_dtype=torch.bfloat16 ) # Load the trained adapter @@ -350,7 +348,7 @@ def test_lora_training_integration(): # Train standard LoRA adapter train_model( dataset_path=str(dataset_path), - base_model="ibm-granite/granite-4.0-micro", + base_model="ibm-granite/granite-4.1-3b", output_file=str(adapter_path), adapter="lora", # Standard LoRA, not aLoRA epochs=1, diff --git a/test/conftest.py b/test/conftest.py index cdea5cb4f..cf6fed7ae 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -34,7 +34,7 @@ def _check_ollama_available(): """Check if Ollama is available by checking if port 11434 is listening. 
Note: This only checks if Ollama is running, not which models are loaded. - Tests may still fail if required models (e.g., granite4:micro) are not pulled. + Tests may still fail if required models (e.g., granite4.1:3b) are not pulled. """ import socket @@ -562,7 +562,7 @@ def pytest_runtest_setup(item): logger.info( "Warming up ollama models before ollama group (keep_alive=-1)..." ) - for model in ["granite4:micro", "granite4:micro-h", "granite3.2-vision"]: + for model in ["granite4.1:3b", "granite3.2-vision"]: try: requests.post( f"{ollama_base}/api/generate", @@ -588,7 +588,7 @@ def pytest_runtest_setup(item): port = os.environ.get("OLLAMA_PORT", "11434") ollama_base = f"http://{host_str}:{port}" logger.info("Evicting ollama models from VRAM after ollama group...") - for model in ["granite4:micro", "granite4:micro-h", "granite3.2-vision"]: + for model in ["granite4.1:3b", "granite3.2-vision"]: try: requests.post( f"{ollama_base}/api/generate", diff --git a/test/core/test_component_typing.py b/test/core/test_component_typing.py index bbc3de9ef..f8d3d411d 100644 --- a/test/core/test_component_typing.py +++ b/test/core/test_component_typing.py @@ -6,7 +6,7 @@ import mellea.stdlib.functional as mfuncs from mellea import MelleaSession, start_session -from mellea.backends.model_ids import IBM_GRANITE_4_HYBRID_MICRO +from mellea.backends.model_ids import IBM_GRANITE_4_1_3B from mellea.backends.ollama import OllamaModelBackend from mellea.core import ( CBlock, @@ -65,10 +65,10 @@ def backend(gh_run: int): """Shared backend.""" if gh_run == 1: return OllamaModelBackend( - model_id=IBM_GRANITE_4_HYBRID_MICRO.ollama_name # type: ignore + model_id=IBM_GRANITE_4_1_3B.ollama_name # type: ignore ) else: - return OllamaModelBackend(model_id=IBM_GRANITE_4_HYBRID_MICRO.ollama_name) # type: ignore + return OllamaModelBackend(model_id=IBM_GRANITE_4_1_3B.ollama_name) # type: ignore @pytest.fixture(scope="module") @@ -118,7 +118,7 @@ def test_incorrect_type_override(): # Marking as 
qualitative for now since there's so much generation required for this. -# Uses granite4:micro-h (3B hybrid, lightweight) in local mode +# Uses granite4.1:3b (3B, lightweight) in local mode @pytest.mark.qualitative @pytest.mark.ollama @pytest.mark.e2e diff --git a/test/formatters/test_template_formatter.py b/test/formatters/test_template_formatter.py index 5dd995b89..5275807c9 100644 --- a/test/formatters/test_template_formatter.py +++ b/test/formatters/test_template_formatter.py @@ -5,7 +5,7 @@ import pytest -from mellea.backends.model_ids import IBM_GRANITE_4_HYBRID_MICRO, ModelIdentifier +from mellea.backends.model_ids import IBM_GRANITE_4_1_3B, ModelIdentifier from mellea.core import CBlock, Component, ModelOutputThunk, TemplateRepresentation from mellea.formatters import TemplateFormatter from mellea.stdlib.components import Instruction, Message, MObject @@ -173,7 +173,7 @@ def _parse(self, computed: ModelOutputThunk) -> str: def test_load_with_model_id(instr: Instruction): - tf = TemplateFormatter(IBM_GRANITE_4_HYBRID_MICRO) + tf = TemplateFormatter(IBM_GRANITE_4_1_3B) tmpl = tf._load_template(instr.format_for_llm()) assert tmpl.name is not None assert "granite" in tmpl.name, ( diff --git a/test/plugins/test_all_payloads.py b/test/plugins/test_all_payloads.py index b1b93401a..96814077a 100644 --- a/test/plugins/test_all_payloads.py +++ b/test/plugins/test_all_payloads.py @@ -77,13 +77,13 @@ def test_defaults(self): def test_construction_with_values(self): payload = SessionPostInitPayload( session_id="s-001", - model_id="granite4:micro", + model_id="granite4.1:3b", context=_SENTINEL_CONTEXT, request_id="r-001", hook="session_post_init", ) assert payload.session_id == "s-001" - assert payload.model_id == "granite4:micro" + assert payload.model_id == "granite4.1:3b" assert payload.context is _SENTINEL_CONTEXT assert payload.request_id == "r-001" assert payload.hook == "session_post_init" diff --git a/test/scripts/run_tests_with_ollama_and_vllm.sh
b/test/scripts/run_tests_with_ollama_and_vllm.sh index 5d95d84ab..82ec24d0b 100755 --- a/test/scripts/run_tests_with_ollama_and_vllm.sh +++ b/test/scripts/run_tests_with_ollama_and_vllm.sh @@ -40,8 +40,7 @@ else fi OLLAMA_BIN="${OLLAMA_BIN:-$(command -v ollama 2>/dev/null || echo "$HOME/.local/bin/ollama")}" OLLAMA_MODEL_LIST=( - "granite4:micro" - "granite4:micro-h" + "granite4.1:3b" "granite3.2-vision" "llama3.2" "qwen2.5vl:7b" @@ -59,7 +58,7 @@ if [[ -z "${WITH_VLLM:-}" ]]; then fi fi VLLM_PORT="${VLLM_PORT:-8100}" -VLLM_MODEL="${VLLM_MODEL:-ibm-granite/granite-4.0-micro}" +VLLM_MODEL="${VLLM_MODEL:-ibm-granite/granite-4.1-3b}" VLLM_GPU_MEM="${VLLM_GPU_MEM:-0.4}" VLLM_MAX_MODEL_LEN="${VLLM_MAX_MODEL_LEN:-4096}" VLLM_MAX_NUM_SEQS="${VLLM_MAX_NUM_SEQS:-256}" diff --git a/test/stdlib/components/test_genstub.py b/test/stdlib/components/test_genstub.py index bdc863fac..367563b18 100644 --- a/test/stdlib/components/test_genstub.py +++ b/test/stdlib/components/test_genstub.py @@ -4,7 +4,7 @@ import pytest from mellea import MelleaSession, generative, start_session -from mellea.backends.model_ids import IBM_GRANITE_4_HYBRID_MICRO +from mellea.backends.model_ids import IBM_GRANITE_4_1_3B from mellea.backends.ollama import OllamaModelBackend from mellea.core import Requirement from mellea.stdlib.components.genstub import ( @@ -17,7 +17,7 @@ from mellea.stdlib.requirements import simple_validate from mellea.stdlib.sampling import RejectionSamplingStrategy -# Module-level markers: Uses granite4:micro-h (3B hybrid, lightweight) in local mode +# Module-level markers: Uses granite4.1:3b (3B, lightweight) in local mode pytestmark = [pytest.mark.ollama, pytest.mark.e2e] @@ -26,10 +26,10 @@ def backend(gh_run: int): """Shared backend.""" if gh_run == 1: return OllamaModelBackend( - model_id=IBM_GRANITE_4_HYBRID_MICRO.ollama_name # type: ignore + model_id=IBM_GRANITE_4_1_3B.ollama_name # type: ignore ) else: - return 
OllamaModelBackend(model_id=IBM_GRANITE_4_HYBRID_MICRO.ollama_name) # type: ignore + return OllamaModelBackend(model_id=IBM_GRANITE_4_1_3B.ollama_name) # type: ignore @generative diff --git a/test/stdlib/test_spans.py b/test/stdlib/test_spans.py index 969e5a29e..7914fa02a 100644 --- a/test/stdlib/test_spans.py +++ b/test/stdlib/test_spans.py @@ -6,13 +6,13 @@ "llguidance", reason="llguidance not installed — install mellea[hf]" ) from mellea.backends.huggingface import LocalHFBackend -from mellea.backends.model_ids import IBM_GRANITE_4_HYBRID_MICRO +from mellea.backends.model_ids import IBM_GRANITE_4_1_3B from mellea.core import CBlock from mellea.stdlib.components import SimpleComponent from mellea.stdlib.session import MelleaSession, start_session from test.predicates import require_gpu -# Module-level markers for all tests using Granite 4 hybrid micro (3B model) +# Module-level markers for all tests using Granite 4.1 3B model pytestmark = [pytest.mark.huggingface, require_gpu(min_vram_gb=12), pytest.mark.e2e] @@ -21,7 +21,7 @@ def m_session(gh_run): m = start_session( "hf", - model_id=IBM_GRANITE_4_HYBRID_MICRO, + model_id=IBM_GRANITE_4_1_3B, model_options={ModelOption.MAX_NEW_TOKENS: 64}, ) yield m diff --git a/test/telemetry/test_backend_instrumentation.py b/test/telemetry/test_backend_instrumentation.py index 4163ccb8d..794d2ee4e 100644 --- a/test/telemetry/test_backend_instrumentation.py +++ b/test/telemetry/test_backend_instrumentation.py @@ -39,9 +39,9 @@ def test_get_model_id_str_plain_string(): def test_get_model_id_str_hf_model_name(): backend = _BackendWithHFModelId( - model_id=_HFModelId(hf_model_name="ibm-granite/granite-4.0-micro") + model_id=_HFModelId(hf_model_name="ibm-granite/granite-4.1-3b") ) - assert get_model_id_str(backend) == "ibm-granite/granite-4.0-micro" + assert get_model_id_str(backend) == "ibm-granite/granite-4.1-3b" def test_get_model_id_str_no_model_id_returns_class_name(): diff --git a/test/telemetry/test_metrics_backend.py 
b/test/telemetry/test_metrics_backend.py index 34374d917..c538accd2 100644 --- a/test/telemetry/test_metrics_backend.py +++ b/test/telemetry/test_metrics_backend.py @@ -7,10 +7,7 @@ import pytest -from mellea.backends.model_ids import ( - IBM_GRANITE_4_HYBRID_MICRO, - IBM_GRANITE_4_HYBRID_SMALL, -) +from mellea.backends.model_ids import IBM_GRANITE_4_1_3B, IBM_GRANITE_4_HYBRID_SMALL from mellea.plugins.manager import ( disable_background_collection, discard_background_tasks, @@ -76,7 +73,7 @@ def hf_metrics_backend(gh_run): from mellea.backends.huggingface import LocalHFBackend backend = LocalHFBackend( - model_id=IBM_GRANITE_4_HYBRID_MICRO.hf_model_name, # type: ignore + model_id=IBM_GRANITE_4_1_3B.hf_model_name, # type: ignore cache=SimpleLRUCache(5), ) @@ -163,7 +160,7 @@ async def test_ollama_token_metrics_integration(enable_metrics, metric_reader, s provider = _setup_metrics_provider(metrics_module, metric_reader) - backend = OllamaModelBackend(model_id=IBM_GRANITE_4_HYBRID_MICRO.ollama_name) # type: ignore + backend = OllamaModelBackend(model_id=IBM_GRANITE_4_1_3B.ollama_name) # type: ignore ctx = SimpleContext() ctx = ctx.add(Message(role="user", content="Say 'hello' and nothing else")) @@ -226,7 +223,7 @@ async def test_openai_token_metrics_integration(enable_metrics, metric_reader, s # Use Ollama's OpenAI-compatible endpoint backend = OpenAIBackend( - model_id=IBM_GRANITE_4_HYBRID_MICRO.ollama_name, # type: ignore + model_id=IBM_GRANITE_4_1_3B.ollama_name, # type: ignore base_url=f"http://{os.environ.get('OLLAMA_HOST', 'localhost:11434')}/v1", api_key="ollama", ) @@ -351,9 +348,7 @@ async def test_litellm_token_metrics_integration( # Use LiteLLM with openai/ prefix - it will use the OPENAI_BASE_URL env var # This tests LiteLLM with a provider that properly returns token usage - backend = LiteLLMBackend( - model_id=f"openai/{IBM_GRANITE_4_HYBRID_MICRO.ollama_name}" - ) # type: ignore + backend = 
LiteLLMBackend(model_id=f"openai/{IBM_GRANITE_4_1_3B.ollama_name}") # type: ignore ctx = SimpleContext() ctx = ctx.add(Message(role="user", content="Say 'hello' and nothing else")) @@ -510,7 +505,7 @@ async def test_ollama_sampling_metrics_integration(enable_metrics, metric_reader provider = _setup_metrics_provider(metrics_module, metric_reader) - backend = OllamaModelBackend(model_id=IBM_GRANITE_4_HYBRID_MICRO.ollama_name) # type: ignore + backend = OllamaModelBackend(model_id=IBM_GRANITE_4_1_3B.ollama_name) # type: ignore strategy = RejectionSamplingStrategy(loop_budget=1) ctx = SimpleContext() diff --git a/test/telemetry/test_tracing_backend.py b/test/telemetry/test_tracing_backend.py index 9d4f3f6d2..a028cb284 100644 --- a/test/telemetry/test_tracing_backend.py +++ b/test/telemetry/test_tracing_backend.py @@ -4,10 +4,7 @@ import pytest -from mellea.backends.model_ids import ( - IBM_GRANITE_4_HYBRID_MICRO, - IBM_GRANITE_4_HYBRID_SMALL, -) +from mellea.backends.model_ids import IBM_GRANITE_4_1_3B, IBM_GRANITE_4_HYBRID_SMALL from mellea.backends.ollama import OllamaModelBackend from mellea.stdlib.components import Message from mellea.stdlib.context import SimpleContext @@ -74,7 +71,7 @@ def span_exporter(): async def test_span_duration_captures_async_operation(span_exporter): """Test that span duration includes the full async operation time.""" - backend = OllamaModelBackend(model_id=IBM_GRANITE_4_HYBRID_MICRO.ollama_name) # type: ignore + backend = OllamaModelBackend(model_id=IBM_GRANITE_4_1_3B.ollama_name) # type: ignore ctx = SimpleContext() ctx = ctx.add(Message(role="user", content="Say 'test' and nothing else")) @@ -110,7 +107,7 @@ async def test_span_duration_captures_async_operation(span_exporter): async def test_context_propagation_parent_child(span_exporter): """Test that parent-child span relationships are maintained.""" - backend = OllamaModelBackend(model_id=IBM_GRANITE_4_HYBRID_MICRO.ollama_name) # type: ignore + backend = 
OllamaModelBackend(model_id=IBM_GRANITE_4_1_3B.ollama_name) # type: ignore ctx = SimpleContext() ctx = ctx.add(Message(role="user", content="Say 'test' and nothing else")) @@ -157,7 +154,7 @@ async def test_context_propagation_parent_child(span_exporter): async def test_token_usage_recorded_after_completion(span_exporter): """Test that token usage metrics are recorded after async completion.""" - backend = OllamaModelBackend(model_id=IBM_GRANITE_4_HYBRID_MICRO.ollama_name) # type: ignore + backend = OllamaModelBackend(model_id=IBM_GRANITE_4_1_3B.ollama_name) # type: ignore ctx = SimpleContext() ctx = ctx.add(Message(role="user", content="Say 'test' and nothing else")) @@ -206,7 +203,7 @@ async def test_token_usage_recorded_after_completion(span_exporter): async def test_span_not_closed_prematurely(span_exporter): """Test that spans are not closed before async operations complete.""" - backend = OllamaModelBackend(model_id=IBM_GRANITE_4_HYBRID_MICRO.ollama_name) # type: ignore + backend = OllamaModelBackend(model_id=IBM_GRANITE_4_1_3B.ollama_name) # type: ignore ctx = SimpleContext() ctx = ctx.add(Message(role="user", content="Count to 5")) @@ -240,7 +237,7 @@ async def test_span_not_closed_prematurely(span_exporter): async def test_multiple_generations_separate_spans(span_exporter): """Test that multiple generations create separate spans.""" - backend = OllamaModelBackend(model_id=IBM_GRANITE_4_HYBRID_MICRO.ollama_name) # type: ignore + backend = OllamaModelBackend(model_id=IBM_GRANITE_4_1_3B.ollama_name) # type: ignore ctx = SimpleContext() ctx = ctx.add(Message(role="user", content="Say 'test'")) @@ -274,7 +271,7 @@ async def test_streaming_span_duration(span_exporter): from mellea.backends.model_options import ModelOption - backend = OllamaModelBackend(model_id=IBM_GRANITE_4_HYBRID_MICRO.ollama_name) # type: ignore + backend = OllamaModelBackend(model_id=IBM_GRANITE_4_1_3B.ollama_name) # type: ignore ctx = SimpleContext() ctx = ctx.add(Message(role="user", 
content="Count to 3"))