Skip to content

Commit 3afb494

Browse files
committed
convert-hf-to-gguf-update: improve download
* share requests session for performance
* create directories only when needed, don't skip downloads when empty directory encountered
* be more graceful about errors
1 parent 86016b7 commit 3afb494

File tree

1 file changed

+23
-40
lines changed

1 file changed

+23
-40
lines changed

convert-hf-to-gguf-update.py

Lines changed: 23 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,7 @@
3737

3838
logging.basicConfig(level=logging.DEBUG)
3939
logger = logging.getLogger("convert-hf-to-gguf-update")
40+
sess = requests.Session()
4041

4142

4243
class TOKENIZER_TYPE(IntEnum):
@@ -81,60 +82,42 @@ class TOKENIZER_TYPE(IntEnum):
8182
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
8283
]
8384

84-
# make directory "models/tokenizers" if it doesn't exist
85-
if not os.path.exists("models/tokenizers"):
86-
os.makedirs("models/tokenizers")
87-
8885

8986
def download_file_with_auth(url, token, save_path):
9087
headers = {"Authorization": f"Bearer {token}"}
91-
response = requests.get(url, headers=headers)
92-
if response.status_code == 200:
93-
with open(save_path, 'wb') as f:
94-
f.write(response.content)
95-
logger.info(f"File {save_path} downloaded successfully")
96-
else:
97-
logger.info(f"Failed to download file. Status code: {response.status_code}")
88+
response = sess.get(url, headers=headers)
89+
response.raise_for_status()
90+
os.makedirs(os.path.dirname(save_path), exist_ok=True)
91+
with open(save_path, 'wb') as f:
92+
f.write(response.content)
93+
logger.info(f"File {save_path} downloaded successfully")
9894

9995

100-
# download the tokenizer models
101-
for model in models:
96+
def download_model(model):
10297
name = model["name"]
10398
repo = model["repo"]
10499
tokt = model["tokt"]
105100

106-
if not os.path.exists(f"models/tokenizers/{name}"):
107-
os.makedirs(f"models/tokenizers/{name}")
108-
else:
109-
logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
110-
continue
111-
112-
logger.info(f"Downloading {name} to models/tokenizers/{name}")
101+
os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
113102

114-
url = f"{repo}/raw/main/config.json"
115-
save_path = f"models/tokenizers/{name}/config.json"
116-
download_file_with_auth(url, token, save_path)
103+
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
104+
if tokt == TOKENIZER_TYPE.SPM:
105+
files.append("tokenizer.model")
117106

118-
url = f"{repo}/raw/main/tokenizer.json"
119-
save_path = f"models/tokenizers/{name}/tokenizer.json"
120-
download_file_with_auth(url, token, save_path)
107+
for file in files:
108+
save_path = f"models/tokenizers/{name}/{file}"
109+
if os.path.isfile(save_path):
110+
logger.info(f"{name}: File {save_path} already exists - skipping")
111+
continue
112+
download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
121113

122-
# if downloaded file is less than 1KB, we likely need to download an LFS instead
123-
if os.path.getsize(save_path) < 1024:
124-
# remove the file
125-
os.remove(save_path)
126-
url = f"{repo}/resolve/main/tokenizer.json"
127-
save_path = f"models/tokenizers/{name}/tokenizer.json"
128-
download_file_with_auth(url, token, save_path)
129114

130-
if tokt == TOKENIZER_TYPE.SPM:
131-
url = f"{repo}/resolve/main/tokenizer.model"
132-
save_path = f"models/tokenizers/{name}/tokenizer.model"
133-
download_file_with_auth(url, token, save_path)
115+
for model in models:
116+
try:
117+
download_model(model)
118+
except Exception as e:
119+
logger.error(f"Failed to download model {model['name']}. Error: {e}")
134120

135-
url = f"{repo}/raw/main/tokenizer_config.json"
136-
save_path = f"models/tokenizers/{name}/tokenizer_config.json"
137-
download_file_with_auth(url, token, save_path)
138121

139122
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
140123

0 commit comments

Comments
 (0)