
Commit 0e4d3a0

Authored by ralphbean, with claude and brian-dellabetta
[Cache] Fix environment variable handling for offline mode (#1902)
SUMMARY: Previously, llm-compressor ignored `HF_HUB_CACHE` and other environment variables when loading models and datasets, making offline mode difficult to use with unified cache directories. This change:

- Removes the hard-coded `TRANSFORMERS_CACHE` in `model_load/helpers.py` to respect the `HF_HOME` and `HF_HUB_CACHE` environment variables
- Propagates `cache_dir` from `model_args` to `dataset_args` to enable a unified cache directory for both models and datasets
- Updates dataset loading to use the `cache_dir` parameter instead of a hardcoded `None`

Users can now specify the `cache_dir` parameter or use the `HF_HOME`/`HF_HUB_CACHE` environment variables for true offline operation. Offline mode is super helpful for supply-chain security use cases: it helps us generate trustworthy SBOMs for AI artifacts. 🔐 🧠

TEST PLAN: I started with the oneshot example from the README and called it `example.py`:

```python
"""
This is the example from the README
"""

from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor import oneshot

recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(scheme="W8A8", targets="Linear", ignore=["lm_head"]),
]

oneshot(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    dataset="open_platypus",
    recipe=recipe,
    output_dir="TinyLlama-1.1B-Chat-v1.0-INT8",
    max_seq_length=2048,
    num_calibration_samples=512,
)
```

Next, remove your local Hugging Face cache to ensure your system has nothing available to it yet:

```bash
❯ rm -rf ~/.cache/huggingface
```

Then, run `example.py` with the `HF_HUB_OFFLINE=1` env var. This should fail, proving that you have nothing cached:

```bash
❯ HF_HUB_OFFLINE=1 python example.py
Traceback (most recent call last):
  File "/home/rbean/code/llm-compressor/testtest/lib64/python3.13/site-packages/transformers/utils/hub.py", line 479, in cached_files
  ... <snip> ...
OSError: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files. Check your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.
```

Good. Now, run it with `HF_HOME=./hf-hub`, which runs it in online mode, populating the cache in a new non-standard location (just to be sure things don't get mixed up during our test):

```bash
❯ HF_HOME=./hf-hub python example.py
<lots of downloading happens, but you can ctrl-C when it gets into the real compression work>
```

Now, finally, you can run with both `HF_HOME` and `HF_HUB_OFFLINE=1` and prove to yourself that llm-compressor uses that freshly populated cache for both the model and the dataset:

```bash
❯ HF_HOME=./hf-hub HF_HUB_OFFLINE=1 python example.py
<it works!>
```

---------

Signed-off-by: Ralph Bean <rbean@redhat.com>
Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Brian Dellabetta <brian-dellabetta@users.noreply.github.com>
1 parent 51ff37d commit 0e4d3a0
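
For background on why `cache_dir=None` enables the environment variables: when no explicit cache directory is given, `huggingface_hub` resolves the cache location from the environment. A minimal sketch of that documented fallback order (an illustration, not code from this diff):

```python
import os


def resolve_hub_cache() -> str:
    """Sketch of huggingface_hub's cache resolution when cache_dir=None.

    Documented fallback order:
      1. HF_HUB_CACHE, if set
      2. $HF_HOME/hub, if HF_HOME is set
      3. ~/.cache/huggingface/hub otherwise
    """
    if os.environ.get("HF_HUB_CACHE"):
        return os.environ["HF_HUB_CACHE"]
    hf_home = os.environ.get("HF_HOME") or os.path.expanduser("~/.cache/huggingface")
    return os.path.join(hf_home, "hub")


# With HF_HOME=./hf-hub (as in the test plan above), both the model and the
# dataset land under ./hf-hub, which HF_HUB_OFFLINE=1 can then read back.
print(resolve_hub_cache())
```

Hard-coding a cache path anywhere in the call chain short-circuits this resolution, which is exactly what the diffs below remove.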

File tree

6 files changed (+9, -16 lines)


src/llmcompressor/args/model_arguments.py

Lines changed: 0 additions & 4 deletions

```diff
@@ -50,10 +50,6 @@ class ModelArguments:
             "help": "Pretrained processor name or path if not the same as model_name"
         },
     )
-    cache_dir: str | None = field(
-        default=None,
-        metadata={"help": "Where to store the pretrained data from huggingface.co"},
-    )
 
     use_auth_token: bool = field(
         default=False,
```

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 0 additions & 3 deletions

```diff
@@ -231,7 +231,6 @@ def oneshot(
     config_name: Optional[str] = None,
     tokenizer: Optional[Union[str, PreTrainedTokenizerBase]] = None,
     processor: Optional[Union[str, ProcessorMixin]] = None,
-    cache_dir: Optional[str] = None,
     use_auth_token: bool = False,
     precision: str = "auto",
     tie_word_embeddings: bool = False,
@@ -279,8 +278,6 @@ def oneshot(
         model_name.
     :param processor: Pretrained processor name or path if not the same as
         model_name.
-    :param cache_dir: Where to store the pretrained data from
-        huggingface.co.
     :param use_auth_token: Whether to use Hugging Face auth token for private
         models.
     :param precision: Precision to cast model weights to, default to auto.
```

src/llmcompressor/entrypoints/utils.py

Lines changed: 5 additions & 5 deletions

```diff
@@ -175,7 +175,7 @@ def initialize_model_from_path(
     model_path = model_args.model
     config = AutoConfig.from_pretrained(
         model_args.config_name if model_args.config_name else model_path,
-        cache_dir=model_args.cache_dir,
+        cache_dir=None,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
         trust_remote_code=model_args.trust_remote_code_model,
@@ -211,7 +211,7 @@ def initialize_model_from_path(
     )
     teacher_kwargs = {
         "config": teacher_config,
-        "cache_dir": model_args.cache_dir,
+        "cache_dir": None,
         "use_auth_token": True if model_args.use_auth_token else None,
         "torch_dtype": parse_dtype(model_args.precision),
         "device_map": teacher_device_map,
@@ -233,7 +233,7 @@ def initialize_model_from_path(
 
     model_kwargs = {
         "config": config,
-        "cache_dir": model_args.cache_dir,
+        "cache_dir": None,
         "revision": model_args.model_revision,
         "use_auth_token": True if model_args.use_auth_token else None,
         "torch_dtype": parse_dtype(model_args.precision),
@@ -266,7 +266,7 @@ def initialize_processor_from_path(
     try:
         processor = AutoProcessor.from_pretrained(
             processor_src,
-            cache_dir=model_args.cache_dir,
+            cache_dir=None,
             use_fast=True,
             revision=model_args.model_revision,
             use_auth_token=True if model_args.use_auth_token else None,
@@ -285,7 +285,7 @@ def initialize_processor_from_path(
         logger.debug("Could not load fast processor, loading slow processor instead")
         processor = AutoProcessor.from_pretrained(
             processor_src,
-            cache_dir=model_args.cache_dir,
+            cache_dir=None,
             use_fast=False,
             revision=model_args.model_revision,
             use_auth_token=True if model_args.use_auth_token else None,
```
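
These call sites now always pass `cache_dir=None`, so `transformers` applies the same environment-variable resolution as the hub. A quick standalone check of that behavior (assumed `transformers` semantics, not part of the diff; the model name is taken from the test plan):

```python
import os

# Set the cache location and offline switch before importing transformers,
# since the Hugging Face libraries read these variables at import time.
os.environ["HF_HOME"] = "./hf-hub"
os.environ["HF_HUB_OFFLINE"] = "1"  # fail fast if anything hits the network

from transformers import AutoConfig

# Mirrors the patched initialize_model_from_path() call: with cache_dir=None,
# the config must come from the env-resolved cache under ./hf-hub.
config = AutoConfig.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    cache_dir=None,
)
print(config.model_type)
```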

src/llmcompressor/pytorch/model_load/helpers.py

Lines changed: 3 additions & 2 deletions

```diff
@@ -149,16 +149,17 @@ def copy_python_files_from_model_cache(model, save_path: str):
     import shutil
 
     from huggingface_hub import hf_hub_download
-    from transformers import TRANSFORMERS_CACHE
     from transformers.utils import http_user_agent
 
     cache_path = config._name_or_path
     if not os.path.exists(cache_path):
         user_agent = http_user_agent()
+        # Use cache_dir=None to respect HF_HOME, HF_HUB_CACHE, and other
+        # environment variables for cache location
         config_file_path = hf_hub_download(
             repo_id=cache_path,
             filename="config.json",
-            cache_dir=TRANSFORMERS_CACHE,
+            cache_dir=None,
             force_download=False,
             user_agent=user_agent,
         )
```
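
With the `TRANSFORMERS_CACHE` import gone, `hf_hub_download` defers to the same environment variables. A small way to verify the fallback yourself (a hypothetical snippet, not from the diff):

```python
import os

# Must be set before importing huggingface_hub, which snapshots these
# variables into its constants at import time.
os.environ["HF_HOME"] = "./hf-hub"

from huggingface_hub import hf_hub_download

# cache_dir=None defers to HF_HUB_CACHE / HF_HOME, so the file should land
# under ./hf-hub/hub/... rather than the old TRANSFORMERS_CACHE location.
path = hf_hub_download(
    repo_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    filename="config.json",
    cache_dir=None,
    force_download=False,
)
print(path)
```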

src/llmcompressor/transformers/finetune/data/base.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -195,7 +195,7 @@ def load_dataset(self):
         logger.debug(f"Loading dataset {self.dataset_args.dataset}")
         return get_raw_dataset(
             self.dataset_args,
-            None,
+            cache_dir=None,
             split=self.split,
             streaming=self.dataset_args.streaming,
             **self.dataset_args.raw_kwargs,
```
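
The datasets side follows the same pattern: with `cache_dir=None`, `datasets.load_dataset` falls back to `HF_DATASETS_CACHE`, which itself defaults to `$HF_HOME/datasets`, so calibration data shares the unified cache root. A hedged standalone check; the `open_platypus` alias is assumed here to map to the `garage-bAInd/Open-Platypus` hub dataset:

```python
import os

# Again, set HF_HOME before the import so datasets picks it up.
os.environ["HF_HOME"] = "./hf-hub"

from datasets import load_dataset

# cache_dir=None lets datasets resolve HF_DATASETS_CACHE / HF_HOME, keeping
# dataset files under the same root as the model weights above.
ds = load_dataset("garage-bAInd/Open-Platypus", split="train", cache_dir=None)
print(len(ds))
```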

tests/llmcompressor/transformers/finetune/data/conftest.py

Lines changed: 0 additions & 1 deletion

```diff
@@ -18,7 +18,6 @@ def tiny_llama_model_args(tiny_llama_path):
 def tiny_llama_tokenizer(tiny_llama_model_args):
     tokenizer = AutoTokenizer.from_pretrained(
         tiny_llama_model_args.model,
-        cache_dir=tiny_llama_model_args.cache_dir,
         use_fast=True,
         revision=tiny_llama_model_args.model_revision,
         use_auth_token=True if tiny_llama_model_args.use_auth_token else None,
```
