@@ -37,6 +37,7 @@
 
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger("convert-hf-to-gguf-update")
+sess = requests.Session()
 
 
 class TOKENIZER_TYPE(IntEnum):
@@ -81,60 +82,42 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
 ]
 
-# make directory "models/tokenizers" if it doesn't exist
-if not os.path.exists("models/tokenizers"):
-    os.makedirs("models/tokenizers")
-
 
 def download_file_with_auth(url, token, save_path):
     headers = {"Authorization": f"Bearer {token}"}
-    response = requests.get(url, headers=headers)
-    if response.status_code == 200:
-        with open(save_path, 'wb') as f:
-            f.write(response.content)
-        logger.info(f"File {save_path} downloaded successfully")
-    else:
-        logger.info(f"Failed to download file. Status code: {response.status_code}")
+    response = sess.get(url, headers=headers)
+    response.raise_for_status()
+    os.makedirs(os.path.dirname(save_path), exist_ok=True)
+    with open(save_path, 'wb') as f:
+        f.write(response.content)
+    logger.info(f"File {save_path} downloaded successfully")
 
 
-# download the tokenizer models
-for model in models:
+def download_model(model):
     name = model["name"]
     repo = model["repo"]
     tokt = model["tokt"]
 
-    if not os.path.exists(f"models/tokenizers/{name}"):
-        os.makedirs(f"models/tokenizers/{name}")
-    else:
-        logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
-        continue
-
-    logger.info(f"Downloading {name} to models/tokenizers/{name}")
+    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
 
-    url = f"{repo}/raw/main/config.json"
-    save_path = f"models/tokenizers/{name}/config.json"
-    download_file_with_auth(url, token, save_path)
+    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
+    if tokt == TOKENIZER_TYPE.SPM:
+        files.append("tokenizer.model")
 
-    url = f"{repo}/raw/main/tokenizer.json"
-    save_path = f"models/tokenizers/{name}/tokenizer.json"
-    download_file_with_auth(url, token, save_path)
+    for file in files:
+        save_path = f"models/tokenizers/{name}/{file}"
+        if os.path.isfile(save_path):
+            logger.info(f"{name}: File {save_path} already exists - skipping")
+            continue
+        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
 
-    # if downloaded file is less than 1KB, we likely need to download an LFS instead
-    if os.path.getsize(save_path) < 1024:
-        # remove the file
-        os.remove(save_path)
-        url = f"{repo}/resolve/main/tokenizer.json"
-        save_path = f"models/tokenizers/{name}/tokenizer.json"
-        download_file_with_auth(url, token, save_path)
 
-    if tokt == TOKENIZER_TYPE.SPM:
-        url = f"{repo}/resolve/main/tokenizer.model"
-        save_path = f"models/tokenizers/{name}/tokenizer.model"
-        download_file_with_auth(url, token, save_path)
+for model in models:
+    try:
+        download_model(model)
+    except Exception as e:
+        logger.error(f"Failed to download model {model['name']}. Error: {e}")
 
-    url = f"{repo}/raw/main/tokenizer_config.json"
-    save_path = f"models/tokenizers/{name}/tokenizer_config.json"
-    download_file_with_auth(url, token, save_path)
 
 # generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
 
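The switch from raw/main to resolve/main URLs is what makes the old under-1KB workaround unnecessary: for a file stored in Git LFS, raw/main serves only the short pointer stub, while resolve/main redirects to the actual object. A minimal sketch of what such a stub looks like to a client, for illustration only (this helper is hypothetical and does not exist in the script):

# Illustration only: a Git LFS pointer file is a short text stub whose first
# line names the LFS spec; fetching an LFS-backed file via raw/main returns
# this stub instead of the real payload, which resolve/main avoids.
def looks_like_lfs_pointer(data: bytes) -> bool:
    return data.startswith(b"version https://git-lfs.github.com/spec/v1")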
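The module-level sess = requests.Session() lets all downloads share one connection pool instead of opening a fresh connection per requests.get call, and response.raise_for_status() turns HTTP errors into exceptions that the per-model try/except can log. A shared session also makes it easy to bolt on automatic retries; a sketch of that optional extension, assuming the stock requests/urllib3 retry adapter (the change above adds no retry logic itself):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Optional hardening, not part of this change: retry transient failures with
# exponential backoff on the shared session used by download_file_with_auth().
sess = requests.Session()
retries = Retry(total=3, backoff_factor=1.0, status_forcelist=(429, 500, 502, 503, 504))
sess.mount("https://", HTTPAdapter(max_retries=retries))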