
Commit 0e4d3a0

Authored by ralphbean, with claude and brian-dellabetta
[Cache] Fix environment variable handling for offline mode (#1902)
SUMMARY: Previously, llm-compressor ignored `HF_HUB_CACHE` and other environment variables when loading models and datasets, making offline mode difficult to use with unified cache directories. This change:

- Removes the hard-coded `TRANSFORMERS_CACHE` in `model_load/helpers.py` to respect the `HF_HOME` and `HF_HUB_CACHE` environment variables
- Propagates `cache_dir` from `model_args` to `dataset_args` to enable a unified cache directory for both models and datasets
- Updates dataset loading to use the `cache_dir` parameter instead of a hardcoded `None`

Users can now specify the `cache_dir` parameter or use the `HF_HOME`/`HF_HUB_CACHE` environment variables for true offline operation. Offline mode is super helpful for supply-chain security use cases: it helps us generate trustworthy SBOMs for AI artifacts. 🔐 🧠

TEST PLAN: I started with the oneshot example from the README and called it `example.py`:

```python
"""
This is the example from the README
"""

from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor import oneshot

recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(scheme="W8A8", targets="Linear", ignore=["lm_head"]),
]

oneshot(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    dataset="open_platypus",
    recipe=recipe,
    output_dir="TinyLlama-1.1B-Chat-v1.0-INT8",
    max_seq_length=2048,
    num_calibration_samples=512,
)
```

Next, remove your local Hugging Face cache to ensure your system has nothing available to it yet:

```bash
❯ rm -rf ~/.cache/huggingface
```

Then, run `example.py` with the `HF_HUB_OFFLINE=1` env var. This should fail, proving that you have nothing cached:

```bash
❯ HF_HUB_OFFLINE=1 python example.py
Traceback (most recent call last):
  File "/home/rbean/code/llm-compressor/testtest/lib64/python3.13/site-packages/transformers/utils/hub.py", line 479, in cached_files
  ... <snip> ...
OSError: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files. Check your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.
```

Good. Now, run it with `HF_HOME=./hf-hub`, which runs it in online mode, populating the cache in a new non-standard location (just to be sure things don't get mixed up during our test):

```bash
❯ HF_HOME=./hf-hub python example.py
<lots of downloading happens, but you can ctrl-C when it gets into the real compression work>
```

Now, finally, you can run with both `HF_HOME` and `HF_HUB_OFFLINE=1` and prove to yourself that llm-compressor uses that freshly populated cache for both the model and the dataset:

```bash
❯ HF_HOME=./hf-hub HF_HUB_OFFLINE=1 python example.py
<it works!>
```

---------

Signed-off-by: Ralph Bean <rbean@redhat.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Brian Dellabetta <brian-dellabetta@users.noreply.github.com>
1 parent 51ff37d commit 0e4d3a0
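
For background on why `cache_dir=None` enables the environment variables: when no explicit cache directory is given, `huggingface_hub` resolves the cache location from the environment. A minimal sketch of that documented fallback order (an illustration, not code from this diff):

```python
import os


def resolve_hub_cache() -> str:
    """Sketch of huggingface_hub's cache resolution when cache_dir=None.

    Documented fallback order:
      1. HF_HUB_CACHE, if set
      2. $HF_HOME/hub, if HF_HOME is set
      3. ~/.cache/huggingface/hub otherwise
    """
    if os.environ.get("HF_HUB_CACHE"):
        return os.environ["HF_HUB_CACHE"]
    hf_home = os.environ.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface")
    return os.path.join(hf_home, "hub")


# With HF_HOME=./hf-hub (as in the test plan above), both the model and the
# dataset land under ./hf-hub, which HF_HUB_OFFLINE=1 can then read back.
print(resolve_hub_cache())
```

Hard-coding a cache path anywhere in the call chain short-circuits this resolution, which is exactly what the diffs below remove.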

File tree

6 files changed (+9, -16 lines)


src/llmcompressor/args/model_arguments.py

Lines changed: 0 additions & 4 deletions

```diff
@@ -50,10 +50,6 @@ class ModelArguments:
             "help": "Pretrained processor name or path if not the same as model_name"
         },
     )
-    cache_dir: str | None = field(
-        default=None,
-        metadata={"help": "Where to store the pretrained data from huggingface.co"},
-    )
 
     use_auth_token: bool = field(
         default=False,
```

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 0 additions & 3 deletions

```diff
@@ -231,7 +231,6 @@ def oneshot(
     config_name: Optional[str] = None,
     tokenizer: Optional[Union[str, PreTrainedTokenizerBase]] = None,
     processor: Optional[Union[str, ProcessorMixin]] = None,
-    cache_dir: Optional[str] = None,
     use_auth_token: bool = False,
     precision: str = "auto",
     tie_word_embeddings: bool = False,
@@ -279,8 +278,6 @@ def oneshot(
         model_name.
     :param processor: Pretrained processor name or path if not the same as
         model_name.
-    :param cache_dir: Where to store the pretrained data from
-        huggingface.co.
     :param use_auth_token: Whether to use Hugging Face auth token for private
         models.
     :param precision: Precision to cast model weights to, default to auto.
```

src/llmcompressor/entrypoints/utils.py

Lines changed: 5 additions & 5 deletions

```diff
@@ -175,7 +175,7 @@ def initialize_model_from_path(
     model_path = model_args.model
     config = AutoConfig.from_pretrained(
         model_args.config_name if model_args.config_name else model_path,
-        cache_dir=model_args.cache_dir,
+        cache_dir=None,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
         trust_remote_code=model_args.trust_remote_code_model,
@@ -211,7 +211,7 @@ def initialize_model_from_path(
     )
     teacher_kwargs = {
         "config": teacher_config,
-        "cache_dir": model_args.cache_dir,
+        "cache_dir": None,
         "use_auth_token": True if model_args.use_auth_token else None,
         "torch_dtype": parse_dtype(model_args.precision),
         "device_map": teacher_device_map,
@@ -233,7 +233,7 @@ def initialize_model_from_path(
 
     model_kwargs = {
         "config": config,
-        "cache_dir": model_args.cache_dir,
+        "cache_dir": None,
         "revision": model_args.model_revision,
         "use_auth_token": True if model_args.use_auth_token else None,
         "torch_dtype": parse_dtype(model_args.precision),
@@ -266,7 +266,7 @@ def initialize_processor_from_path(
     try:
         processor = AutoProcessor.from_pretrained(
             processor_src,
-            cache_dir=model_args.cache_dir,
+            cache_dir=None,
             use_fast=True,
             revision=model_args.model_revision,
             use_auth_token=True if model_args.use_auth_token else None,
@@ -285,7 +285,7 @@ def initialize_processor_from_path(
         logger.debug("Could not load fast processor, loading slow processor instead")
         processor = AutoProcessor.from_pretrained(
             processor_src,
-            cache_dir=model_args.cache_dir,
+            cache_dir=None,
             use_fast=False,
             revision=model_args.model_revision,
             use_auth_token=True if model_args.use_auth_token else None,
```
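
These call sites now always pass `cache_dir=None`, so `transformers` applies the same environment-variable resolution as the hub. A quick standalone check of that behavior (assumed `transformers` semantics, not part of the diff; the model name is taken from the test plan):

```python
import os

# Set the cache location and offline switch before importing transformers,
# since the Hugging Face libraries read these variables at import time.
os.environ["HF_HOME"] = "./hf-hub"
os.environ["HF_HUB_OFFLINE"] = "1"  # fail fast if anything hits the network

from transformers import AutoConfig

# Mirrors the patched initialize_model_from_path() call: with cache_dir=None,
# the config must come from the env-resolved cache under ./hf-hub.
config = AutoConfig.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    cache_dir=None,
)
print(config.model_type)
```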

src/llmcompressor/pytorch/model_load/helpers.py

Lines changed: 3 additions & 2 deletions

```diff
@@ -149,16 +149,17 @@ def copy_python_files_from_model_cache(model, save_path: str):
     import shutil
 
     from huggingface_hub import hf_hub_download
-    from transformers import TRANSFORMERS_CACHE
     from transformers.utils import http_user_agent
 
     cache_path = config._name_or_path
     if not os.path.exists(cache_path):
         user_agent = http_user_agent()
+        # Use cache_dir=None to respect HF_HOME, HF_HUB_CACHE, and other
+        # environment variables for cache location
         config_file_path = hf_hub_download(
             repo_id=cache_path,
             filename="config.json",
-            cache_dir=TRANSFORMERS_CACHE,
+            cache_dir=None,
             force_download=False,
             user_agent=user_agent,
         )
```
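
With the `TRANSFORMERS_CACHE` import gone, `hf_hub_download` defers to the same environment variables. A small way to verify the fallback yourself (a hypothetical snippet, not from the diff):

```python
import os

# Must be set before importing huggingface_hub, which snapshots these
# variables into its constants at import time.
os.environ["HF_HOME"] = "./hf-hub"

from huggingface_hub import hf_hub_download

# cache_dir=None defers to HF_HUB_CACHE / HF_HOME, so the file should land
# under ./hf-hub/hub/... rather than the old TRANSFORMERS_CACHE location.
path = hf_hub_download(
    repo_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    filename="config.json",
    cache_dir=None,
    force_download=False,
)
print(path)
```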

src/llmcompressor/transformers/finetune/data/base.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -195,7 +195,7 @@ def load_dataset(self):
         logger.debug(f"Loading dataset {self.dataset_args.dataset}")
         return get_raw_dataset(
             self.dataset_args,
-            None,
+            cache_dir=None,
             split=self.split,
             streaming=self.dataset_args.streaming,
             **self.dataset_args.raw_kwargs,
```
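
The datasets side follows the same pattern: with `cache_dir=None`, `datasets.load_dataset` falls back to `HF_DATASETS_CACHE`, which itself defaults to `$HF_HOME/datasets`, so calibration data shares the unified cache root. A hedged standalone check; the `open_platypus` alias is assumed here to map to the `garage-bAInd/Open-Platypus` hub dataset:

```python
import os

# Again, set HF_HOME before the import so datasets picks it up.
os.environ["HF_HOME"] = "./hf-hub"

from datasets import load_dataset

# cache_dir=None lets datasets resolve HF_DATASETS_CACHE / HF_HOME, keeping
# dataset files under the same root as the model weights above.
ds = load_dataset("garage-bAInd/Open-Platypus", split="train", cache_dir=None)
print(len(ds))
```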

tests/llmcompressor/transformers/finetune/data/conftest.py

Lines changed: 0 additions & 1 deletion

```diff
@@ -18,7 +18,6 @@ def tiny_llama_model_args(tiny_llama_path):
 def tiny_llama_tokenizer(tiny_llama_model_args):
     tokenizer = AutoTokenizer.from_pretrained(
         tiny_llama_model_args.model,
-        cache_dir=tiny_llama_model_args.cache_dir,
         use_fast=True,
         revision=tiny_llama_model_args.model_revision,
         use_auth_token=True if tiny_llama_model_args.use_auth_token else None,
```
