Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions .github/workflows/pre-tokenizer-hashes.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
name: Check Pre-Tokenizer Hashes

on:
push:
paths:
- 'convert_hf_to_gguf.py'
- 'convert_hf_to_gguf_update.py'
pull_request:
paths:
- 'convert_hf_to_gguf.py'
- 'convert_hf_to_gguf_update.py'

jobs:
pre-tokenizer-hashes:
runs-on: ubuntu-latest

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'

- name: Install Python dependencies
run: |
python3 -m venv .venv
.venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt

- name: Update pre-tokenizer hashes
run: |
cp convert_hf_to_gguf.py /tmp
.venv/bin/python convert_hf_to_gguf_update.py --check-missing

- name: Check if committed pre-tokenizer hashes matches generated version
run: |
if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
echo "Differences found:"
diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
exit 1
fi
echo "Model pre-tokenizer hashes are up to date."
21 changes: 15 additions & 6 deletions convert_hf_to_gguf_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ class TOKENIZER_TYPE(IntEnum):
"--full", action="store_true",
help="download full list of models - make sure you have access to all of them",
)
parser.add_argument(
"--check-missing", action="store_true",
help="only check for missing pre-tokenizer hashes",
)
parser.add_argument(
"hf_token",
help="optional HF token",
Expand All @@ -70,6 +74,10 @@ class TOKENIZER_TYPE(IntEnum):
if hf_token is None:
logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")

if args.check_missing and args.full:
logger.warning("Downloading full list of models requested, ignoring --check-missing!")
args.check_missing = False

# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
Expand Down Expand Up @@ -222,12 +230,13 @@ def get_existing_models(convert_py):
all_models = models.copy()
models = [model for model in all_models if model["name"] not in existing_models]

logging.info(f"Downloading {len(models)} models...")
for model in models:
try:
download_model(model)
except Exception as e:
logger.error(f"Failed to download model {model['name']}. Error: {e}")
if not args.check_missing:
logging.info(f"Downloading {len(models)} models...")
for model in models:
try:
download_model(model)
except Exception as e:
logger.error(f"Failed to download model {model['name']}. Error: {e}")


# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
Expand Down
6 changes: 0 additions & 6 deletions requirements/requirements-convert_hf_to_gguf_update.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1 @@
-r ./requirements-convert_legacy_llama.txt
--extra-index-url https://download.pytorch.org/whl/cpu
torch~=2.2.1; platform_machine != "s390x"

# torch s390x packages can only be found from nightly builds
--extra-index-url https://download.pytorch.org/whl/nightly
torch>=0.0.0.dev0; platform_machine == "s390x"