<!--Copyright 2025 the HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer.

-->
*This model was released on 2025-07-09 and added to Hugging Face Transformers on 2025-09-15.*
<div style="float: right;">
    <div class="flex flex-wrap space-x-1">
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
    </div>
</div>

# FlexOlmo

[FlexOlmo](https://huggingface.co/papers/2507.07024) is a new class of language models (LMs) that supports (1) distributed training without data sharing, where different model parameters are independently trained on closed datasets, and (2) data-flexible inference, where these parameters, along with their associated data, can be flexibly included or excluded from model inference with no further training. FlexOlmo employs a mixture-of-experts (MoE) architecture in which each expert is trained independently on a closed dataset and later integrated through a new domain-informed routing scheme, without any joint training. FlexOlmo is trained on FlexMix, a curated corpus comprising publicly available datasets alongside seven domain-specific sets representing realistic approximations of closed datasets.
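
Because each expert is trained independently and the router is the only component that ties them together, an expert (and with it, the data it was trained on) can be opted out at inference time simply by never routing to it. The sketch below only illustrates the idea — the tensor names, shapes, expert index, and top-k scheme are assumptions for exposition, not FlexOlmo internals:

```py
import torch

num_experts, top_k, hidden_dim = 7, 2, 16
router = torch.nn.Linear(hidden_dim, num_experts)   # produces per-expert logits

with torch.no_grad():
    hidden_states = torch.randn(4, hidden_dim)      # (num_tokens, hidden_dim)
    logits = router(hidden_states)                  # (num_tokens, num_experts)

    # Opt out expert 3 (hypothetical index): masking its logit gives it zero
    # probability after softmax, so it is never selected or mixed in.
    logits[:, 3] = float("-inf")

    routing_weights = logits.softmax(dim=-1)
    top_weights, top_experts = torch.topk(routing_weights, top_k, dim=-1)

print(top_experts)  # expert 3 never appears in the selection
```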

You can find all the original FlexOlmo checkpoints under the [FlexOlmo](https://huggingface.co/collections/allenai/flexolmo-68471177a386b6e20a54c55f) collection.

> [!TIP]
> Click on the FlexOlmo models in the right sidebar for more examples of how to apply FlexOlmo to different language tasks.

The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.

<hfoptions id="usage">
<hfoption id="Pipeline">

```py
import torch
from transformers import pipeline

pipe = pipeline(
    task="text-generation",
    model="allenai/FlexOlmo-7x7B-1T",
    dtype=torch.bfloat16,
    device=0,
)

result = pipe("Plants create energy through a process known as")
print(result)
```

</hfoption>
<hfoption id="AutoModel">

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "allenai/FlexOlmo-7x7B-1T"
)

model = AutoModelForCausalLM.from_pretrained(
    "allenai/FlexOlmo-7x7B-1T",
    dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="sdpa"
)
input_ids = tokenizer("Plants create energy through a process known as", return_tensors="pt").to(model.device)

output = model.generate(**input_ids, max_length=50, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

</hfoption>
<hfoption id="transformers CLI">

```bash
echo -e "Plants create energy through a process known as" | transformers-cli run --task text-generation --model allenai/FlexOlmo-7x7B-1T --device 0
```

</hfoption>
</hfoptions>
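
The examples above request the SDPA attention backend. FlexOlmo also supports FlashAttention (see the badges at the top of this page); assuming the `flash-attn` package is installed, you can opt into it by swapping the backend at load time:

```py
import torch
from transformers import AutoModelForCausalLM

# Same checkpoint as above; FlashAttention-2 requires the flash-attn package.
model = AutoModelForCausalLM.from_pretrained(
    "allenai/FlexOlmo-7x7B-1T",
    dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",
)
```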

Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.

The example below uses [torchao](../quantization/torchao) to quantize only the weights to 4-bits.

```py
# pip install torchao
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

torchao_config = TorchAoConfig(
    "int4_weight_only",
    group_size=128
)

tokenizer = AutoTokenizer.from_pretrained(
    "allenai/FlexOlmo-7x7B-1T"
)

model = AutoModelForCausalLM.from_pretrained(
    "allenai/FlexOlmo-7x7B-1T",
    quantization_config=torchao_config,
    dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="sdpa"
)
input_ids = tokenizer("Plants create energy through a process known as", return_tensors="pt").to(model.device)

output = model.generate(**input_ids, max_length=50, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
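
To verify the savings, [`~PreTrainedModel.get_memory_footprint`] reports the model's in-memory size; comparing the quantized load against an unquantized one makes the reduction visible:

```py
# Rough check of the quantized model's size in memory.
print(f"Memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
```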

## FlexOlmoConfig

[[autodoc]] FlexOlmoConfig

## FlexOlmoForCausalLM

[[autodoc]] FlexOlmoForCausalLM

## FlexOlmoModel

[[autodoc]] FlexOlmoModel
    - forward

## FlexOlmoPreTrainedModel

[[autodoc]] FlexOlmoPreTrainedModel
    - forward